author    Vitaly Buka <vitalybuka@google.com> 2024-10-16 18:31:37 -0700
committer Vitaly Buka <vitalybuka@google.com> 2024-10-16 18:31:37 -0700
commit    b47049c92e0558a726eff2f0d501e22cca3ca63d (patch)
tree      6b55981aa6e162e666325d2f0092ceba4fbfb4d4
parent    c271c489aa9aa7f1a7d93a037d5633428853acdd (diff)
parent    dd9a34fd7e6cb190d44d310a610e9f959e2e599f (diff)
[𝘀𝗽𝗿] changes introduced through rebase (branch: users/vitalybuka/spr/main.nfclsan-restructure-loop-in-processthreads)
Created using spr 1.3.4 [skip ci]
-rw-r--r--  clang-tools-extra/clang-tidy/ClangTidy.cpp | 1
-rw-r--r--  clang/docs/ReleaseNotes.rst | 15
-rw-r--r--  clang/docs/SafeBuffers.rst | 585
-rw-r--r--  clang/docs/analyzer/checkers.rst | 17
-rw-r--r--  clang/docs/index.rst | 1
-rw-r--r--  clang/include/clang/AST/DeclTemplate.h | 18
-rw-r--r--  clang/include/clang/Analysis/FlowSensitive/CachedConstAccessorsLattice.h | 217
-rw-r--r--  clang/include/clang/CodeGen/CodeGenABITypes.h | 24
-rw-r--r--  clang/include/clang/Driver/Options.td | 2
-rw-r--r--  clang/include/clang/ExtractAPI/API.h | 37
-rw-r--r--  clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def | 13
-rw-r--r--  clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h | 8
-rw-r--r--  clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h | 11
-rw-r--r--  clang/lib/AST/DeclTemplate.cpp | 28
-rw-r--r--  clang/lib/AST/ExprConstant.cpp | 43
-rw-r--r--  clang/lib/Basic/Targets/OSTargets.cpp | 6
-rw-r--r--  clang/lib/CodeGen/CGBuiltin.cpp | 3
-rw-r--r--  clang/lib/CodeGen/CodeGenABITypes.cpp | 25
-rw-r--r--  clang/lib/CodeGen/CodeGenModule.cpp | 1
-rw-r--r--  clang/lib/CodeGen/Targets/DirectX.cpp | 1
-rw-r--r--  clang/lib/Driver/ToolChains/Clang.cpp | 1
-rw-r--r--  clang/lib/Driver/ToolChains/CommonArgs.cpp | 10
-rw-r--r--  clang/lib/Sema/SemaDeclCXX.cpp | 2
-rw-r--r--  clang/lib/Sema/SemaOpenACC.cpp | 2
-rw-r--r--  clang/lib/Sema/SemaTemplateInstantiate.cpp | 4
-rw-r--r--  clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp | 2
-rw-r--r--  clang/lib/StaticAnalyzer/Core/ExprEngine.cpp | 41
-rw-r--r--  clang/test/Analysis/string.c | 46
-rw-r--r--  clang/test/Analysis/string.cpp | 10
-rw-r--r--  clang/test/CXX/temp/temp.constr/temp.constr.decl/p4.cpp | 284
-rw-r--r--  clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl | 20
-rw-r--r--  clang/test/CodeGenHLSL/builtins/sign.hlsl | 8
-rw-r--r--  clang/test/CodeGenHLSL/builtins/wave_get_lane_index_do_while.hlsl | 4
-rw-r--r--  clang/test/CodeGenHLSL/builtins/wave_get_lane_index_simple.hlsl | 4
-rw-r--r--  clang/test/CodeGenHLSL/builtins/wave_get_lane_index_subcall.hlsl | 4
-rw-r--r--  clang/test/CodeGenHLSL/builtins/wave_is_first_lane.hlsl | 4
-rw-r--r--  clang/test/CodeGenOpenCL/addr-space-struct-arg.cl | 1252
-rw-r--r--  clang/test/CodeGenOpenCL/amdgcn-automatic-variable.cl | 122
-rw-r--r--  clang/test/CodeGenOpenCL/pipe_builtin.cl | 21
-rw-r--r--  clang/test/Driver/cl-options.c | 3
-rw-r--r--  clang/test/ExtractAPI/anonymous_record_no_typedef.c | 44
-rw-r--r--  clang/test/ExtractAPI/typedef_anonymous_record.c | 27
-rw-r--r--  clang/test/Modules/friend-definition-2.cpp | 59
-rw-r--r--  clang/test/Preprocessor/predefined-win-macros.c | 7
-rw-r--r--  clang/test/SemaCXX/constexpr-builtin-bit-cast.cpp | 16
-rw-r--r--  clang/test/SemaCXX/virtual-override.cpp | 6
-rw-r--r--  clang/unittests/Analysis/FlowSensitive/CMakeLists.txt | 1
-rw-r--r--  clang/unittests/Analysis/FlowSensitive/CachedConstAccessorsLatticeTest.cpp | 305
-rw-r--r--  clang/utils/TableGen/MveEmitter.cpp | 51
-rw-r--r--  compiler-rt/lib/orc/dlfcn_wrapper.cpp | 8
-rw-r--r--  compiler-rt/lib/orc/macho_platform.cpp | 14
-rw-r--r--  compiler-rt/lib/orc/macho_platform.h | 2
-rw-r--r--  compiler-rt/lib/rtsan/rtsan_assertions.h | 3
-rw-r--r--  compiler-rt/lib/rtsan/rtsan_checks.inc | 1
-rw-r--r--  compiler-rt/lib/rtsan/rtsan_suppressions.cpp | 13
-rw-r--r--  compiler-rt/lib/rtsan/rtsan_suppressions.h | 1
-rw-r--r--  compiler-rt/lib/sanitizer_common/tests/sanitizer_posix_test.cpp | 2
-rw-r--r--  compiler-rt/test/profile/Posix/instrprof-visibility.cpp | 1
-rw-r--r--  compiler-rt/test/profile/coverage-inline.cpp | 1
-rw-r--r--  compiler-rt/test/profile/coverage_comments.cpp | 1
-rw-r--r--  compiler-rt/test/profile/coverage_emptylines.cpp | 1
-rw-r--r--  compiler-rt/test/profile/instrprof-merging.cpp | 1
-rw-r--r--  compiler-rt/test/profile/instrprof-set-file-object-merging.c | 1
-rw-r--r--  compiler-rt/test/profile/instrprof-set-file-object.c | 1
-rw-r--r--  compiler-rt/test/profile/instrprof-without-libc.c | 1
-rw-r--r--  compiler-rt/test/profile/instrprof-write-file-only.c | 1
-rw-r--r--  compiler-rt/test/profile/lit.cfg.py | 8
-rw-r--r--  compiler-rt/test/rtsan/stack_suppressions.cpp | 25
-rw-r--r--  compiler-rt/test/rtsan/stack_suppressions.cpp.supp | 5
-rw-r--r--  flang/lib/Evaluate/intrinsics-library.cpp | 2
-rw-r--r--  flang/lib/Lower/OpenMP/OpenMP.cpp | 16
-rw-r--r--  flang/runtime/Float128Math/math-entries.h | 9
-rw-r--r--  flang/test/Driver/atomic.f90 | 5
-rw-r--r--  flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90 | 262
-rw-r--r--  flang/test/Integration/OpenMP/private-global.f90 | 46
-rw-r--r--  flang/test/Lower/OpenMP/simd.f90 | 24
-rw-r--r--  libc/config/gpu/entrypoints.txt | 1
-rw-r--r--  libc/config/linux/x86_64/entrypoints.txt | 1
-rw-r--r--  libc/docs/math/index.rst | 2
-rw-r--r--  libc/spec/stdc.td | 2
-rw-r--r--  libc/src/math/CMakeLists.txt | 2
-rw-r--r--  libc/src/math/exp10m1f16.h | 21
-rw-r--r--  libc/src/math/generic/CMakeLists.txt | 23
-rw-r--r--  libc/src/math/generic/exp10f16.cpp | 47
-rw-r--r--  libc/src/math/generic/exp10m1f16.cpp | 163
-rw-r--r--  libc/src/math/generic/expxf16.h | 47
-rw-r--r--  libc/test/src/math/CMakeLists.txt | 11
-rw-r--r--  libc/test/src/math/exp10m1f16_test.cpp | 40
-rw-r--r--  libc/test/src/math/smoke/CMakeLists.txt | 13
-rw-r--r--  libc/test/src/math/smoke/exp10m1f16_test.cpp | 113
-rw-r--r--  libc/utils/MPFRWrapper/MPFRUtils.cpp | 25
-rw-r--r--  libc/utils/MPFRWrapper/MPFRUtils.h | 1
-rw-r--r--  libcxx/docs/Status/Cxx23Issues.csv | 2
-rw-r--r--  libcxx/docs/Status/Cxx23Papers.csv | 2
-rw-r--r--  libcxx/include/__ranges/zip_view.h | 55
-rw-r--r--  libcxx/include/__split_buffer | 3
-rw-r--r--  libcxx/include/future | 4
-rw-r--r--  libcxx/test/std/ranges/range.adaptors/range.zip/cpo.pass.cpp | 4
-rw-r--r--  libcxx/test/std/ranges/range.adaptors/range.zip/ctor.default.pass.cpp | 6
-rw-r--r--  libcxx/test/std/ranges/range.adaptors/range.zip/iterator/compare.pass.cpp | 16
-rw-r--r--  libcxx/test/std/ranges/range.adaptors/range.zip/iterator/deref.pass.cpp | 8
-rw-r--r--  libcxx/test/std/ranges/range.adaptors/range.zip/iterator/member_types.compile.pass.cpp | 14
-rw-r--r--  libcxx/test/std/ranges/range.adaptors/range.zip/iterator/subscript.pass.cpp | 8
-rw-r--r--  lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py | 2
-rw-r--r--  lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_arm64.cpp | 20
-rw-r--r--  lldb/source/Utility/DiagnosticsRendering.cpp | 56
-rw-r--r--  lldb/test/API/commands/expression/import-std-module/array/TestArrayFromStdModule.py | 1
-rw-r--r--  lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py | 1
-rw-r--r--  lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py | 1
-rw-r--r--  lldb/test/API/commands/expression/top-level/Makefile | 2
-rw-r--r--  lldb/test/API/commands/expression/weak_symbols/Makefile | 4
-rw-r--r--  lldb/test/API/commands/target/create-deps/Makefile | 2
-rw-r--r--  lldb/test/API/functionalities/breakpoint/break_in_loaded_dylib/Makefile | 2
-rw-r--r--  lldb/test/API/functionalities/dlopen_other_executable/Makefile | 2
-rw-r--r--  lldb/test/API/functionalities/exec/Makefile | 2
-rw-r--r--  lldb/test/API/functionalities/jitloader_gdb/Makefile | 2
-rw-r--r--  lldb/test/API/functionalities/limit-debug-info/Makefile | 4
-rw-r--r--  lldb/test/API/functionalities/load_after_attach/Makefile | 2
-rw-r--r--  lldb/test/API/functionalities/load_lazy/Makefile | 6
-rw-r--r--  lldb/test/API/functionalities/load_unload/Makefile | 10
-rw-r--r--  lldb/test/API/functionalities/load_using_paths/Makefile | 2
-rw-r--r--  lldb/test/API/functionalities/scripted_process/Makefile | 2
-rw-r--r--  lldb/test/API/functionalities/stop-on-sharedlibrary-load/Makefile | 4
-rw-r--r--  lldb/test/API/functionalities/tail_call_frames/cross_dso/Makefile | 2
-rw-r--r--  lldb/test/API/functionalities/target-new-solib-notifications/Makefile | 20
-rw-r--r--  lldb/test/API/lang/c/conflicting-symbol/Makefile | 2
-rw-r--r--  lldb/test/API/lang/cpp/incomplete-types/Makefile | 2
-rw-r--r--  lldb/test/API/lang/cpp/namespace_definitions/Makefile | 4
-rw-r--r--  lldb/test/API/lang/cpp/stl/Makefile | 6
-rw-r--r--  lldb/test/API/lang/objc/conflicting-definition/Makefile | 4
-rw-r--r--  lldb/test/API/lang/objc/modules-hash-mismatch/Makefile | 2
-rw-r--r--  lldb/test/API/macosx/delay-init-dependency/Makefile | 2
-rw-r--r--  lldb/test/API/macosx/expedited-thread-pcs/Makefile | 2
-rw-r--r--  lldb/test/API/macosx/indirect_symbol/Makefile | 4
-rw-r--r--  lldb/test/API/macosx/lc-note/kern-ver-str/Makefile | 2
-rw-r--r--  lldb/test/API/macosx/lc-note/multiple-binary-corefile/Makefile | 4
-rw-r--r--  lldb/test/API/macosx/macCatalystAppMacOSFramework/Makefile | 2
-rw-r--r--  lldb/test/API/macosx/skinny-corefile/Makefile | 4
-rw-r--r--  lldb/test/API/tools/lldb-dap/breakpoint/Makefile | 2
-rw-r--r--  lldb/test/API/tools/lldb-dap/send-event/Makefile | 3
-rw-r--r--  lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py | 67
-rw-r--r--  lldb/test/API/tools/lldb-dap/send-event/main.c | 6
-rw-r--r--  lldb/test/API/tools/lldb-server/libraries-svr4/Makefile | 4
-rw-r--r--  lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-alignment.cpp (renamed from lldb/test/Shell/SymbolFile/DWARF/no_unique_address-alignment.cpp) | 2
-rw-r--r--  lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-base-alignment.cpp (renamed from lldb/test/Shell/SymbolFile/DWARF/no_unique_address-base-alignment.cpp) | 2
-rw-r--r--  lldb/tools/debugserver/source/RNBRemote.cpp | 16
-rw-r--r--  lldb/tools/debugserver/source/libdebugserver.cpp | 8
-rw-r--r--  lldb/tools/lldb-dap/DAP.cpp | 62
-rw-r--r--  lldb/tools/lldb-dap/DAP.h | 5
-rw-r--r--  lldb/tools/lldb-dap/README.md | 31
-rw-r--r--  lldb/tools/lldb-dap/lldb-dap.cpp | 2
-rw-r--r--  lldb/unittests/Utility/DiagnosticsRenderingTest.cpp | 55
-rw-r--r--  llvm/include/llvm/CodeGen/BasicTTIImpl.h | 3
-rw-r--r--  llvm/include/llvm/CodeGen/EarlyIfConversion.h | 24
-rw-r--r--  llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h | 37
-rw-r--r--  llvm/include/llvm/CodeGen/ISDOpcodes.h | 3
-rw-r--r--  llvm/include/llvm/CodeGen/MIRParser/MIParser.h | 2
-rw-r--r--  llvm/include/llvm/CodeGen/MachineTraceMetrics.h | 65
-rw-r--r--  llvm/include/llvm/CodeGen/Passes.h | 2
-rw-r--r--  llvm/include/llvm/CodeGen/SDPatternMatch.h | 23
-rw-r--r--  llvm/include/llvm/IR/ConstrainedOps.def | 1
-rw-r--r--  llvm/include/llvm/IR/Intrinsics.h | 10
-rw-r--r--  llvm/include/llvm/IR/Intrinsics.td | 5
-rw-r--r--  llvm/include/llvm/IR/RuntimeLibcalls.def | 5
-rw-r--r--  llvm/include/llvm/InitializePasses.h | 4
-rw-r--r--  llvm/include/llvm/Passes/CodeGenPassBuilder.h | 1
-rw-r--r--  llvm/include/llvm/Passes/MachinePassRegistry.def | 6
-rw-r--r--  llvm/include/llvm/Target/TargetSelectionDAG.td | 6
-rw-r--r--  llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h | 4
-rw-r--r--  llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h | 1
-rw-r--r--  llvm/lib/Analysis/LazyValueInfo.cpp | 2
-rw-r--r--  llvm/lib/Analysis/ScalarEvolution.cpp | 12
-rw-r--r--  llvm/lib/Analysis/ValueTracking.cpp | 9
-rw-r--r--  llvm/lib/Analysis/VectorUtils.cpp | 11
-rw-r--r--  llvm/lib/CodeGen/CodeGen.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/EarlyIfConversion.cpp | 85
-rw-r--r--  llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 12
-rw-r--r--  llvm/lib/CodeGen/MIRParser/MIRParser.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/MachineCombiner.cpp | 8
-rw-r--r--  llvm/lib/CodeGen/MachineTraceMetrics.cpp | 79
-rw-r--r--  llvm/lib/CodeGen/SafeStack.cpp | 3
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 3
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 7
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp | 22
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 2
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 1
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 3
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 1
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 12
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/StackProtector.cpp | 6
-rw-r--r--  llvm/lib/CodeGen/TargetLoweringBase.cpp | 7
-rw-r--r--  llvm/lib/CodeGen/TargetPassConfig.cpp | 4
-rw-r--r--  llvm/lib/ExecutionEngine/Orc/LLJIT.cpp | 5
-rw-r--r--  llvm/lib/IR/AutoUpgrade.cpp | 150
-rw-r--r--  llvm/lib/IR/Intrinsics.cpp | 10
-rw-r--r--  llvm/lib/IR/LegacyPassManager.cpp | 8
-rw-r--r--  llvm/lib/IR/RuntimeLibcalls.cpp | 1
-rw-r--r--  llvm/lib/LTO/LTO.cpp | 14
-rw-r--r--  llvm/lib/Passes/PassBuilder.cpp | 2
-rw-r--r--  llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp | 64
-rw-r--r--  llvm/lib/Support/FormatVariadic.cpp | 7
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp | 8
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 4
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.td | 15
-rw-r--r--  llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h | 2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp | 6
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetMachine.cpp | 2
-rw-r--r--  llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp | 205
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp | 5
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 9
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 27
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.td | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstructions.td | 86
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP1Instructions.td | 21
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP2Instructions.td | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP3Instructions.td | 84
-rw-r--r--  llvm/lib/Target/AMDGPU/VOPInstructions.td | 312
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.cpp | 4
-rw-r--r--  llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 31
-rw-r--r--  llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp | 5
-rw-r--r--  llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp | 3
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp | 3
-rw-r--r--  llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 7
-rw-r--r--  llvm/lib/Target/PowerPC/PPCInstrVSX.td | 4
-rw-r--r--  llvm/lib/Target/PowerPC/PPCTargetMachine.cpp | 2
-rw-r--r--  llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp | 3
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 21
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 14
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfo.h | 3
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoC.td | 14
-rw-r--r--  llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 6
-rw-r--r--  llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp | 91
-rw-r--r--  llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp | 22
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZISelLowering.cpp | 2
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp | 2
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp | 2
-rw-r--r--  llvm/lib/Target/X86/X86ISelLoweringCall.cpp | 7
-rw-r--r--  llvm/lib/Target/X86/X86TargetMachine.cpp | 2
-rw-r--r--  llvm/lib/Target/X86/X86WinEHState.cpp | 21
-rw-r--r--  llvm/lib/Transforms/IPO/ExpandVariadics.cpp | 5
-rw-r--r--  llvm/lib/Transforms/IPO/GlobalDCE.cpp | 6
-rw-r--r--  llvm/lib/Transforms/IPO/GlobalSplit.cpp | 8
-rw-r--r--  llvm/lib/Transforms/IPO/LowerTypeTests.cpp | 6
-rw-r--r--  llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp | 10
-rw-r--r--  llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp | 13
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp | 8
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp | 4
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp | 51
-rw-r--r--  llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp | 8
-rw-r--r--  llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp | 20
-rw-r--r--  llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp | 2
-rw-r--r--  llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp | 12
-rw-r--r--  llvm/lib/Transforms/Instrumentation/KCFI.cpp | 3
-rw-r--r--  llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp | 36
-rw-r--r--  llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp | 4
-rw-r--r--  llvm/lib/Transforms/Scalar/Float2Int.cpp | 4
-rw-r--r--  llvm/lib/Transforms/Scalar/GuardWidening.cpp | 8
-rw-r--r--  llvm/lib/Transforms/Scalar/IndVarSimplify.cpp | 4
-rw-r--r--  llvm/lib/Transforms/Scalar/JumpThreading.cpp | 4
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopPredication.cpp | 6
-rw-r--r--  llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp | 4
-rw-r--r--  llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp | 4
-rw-r--r--  llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp | 4
-rw-r--r--  llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp | 4
-rw-r--r--  llvm/lib/Transforms/Utils/SCCPSolver.cpp | 5
-rw-r--r--  llvm/lib/Transforms/Vectorize/SandboxVectorizer/SeedCollector.cpp | 12
-rw-r--r--  llvm/test/Analysis/CostModel/RISCV/arith-fp.ll | 56
-rw-r--r--  llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll | 207
-rw-r--r--  llvm/test/Analysis/CostModel/RISCV/fp-trig-log-exp.ll | 805
-rw-r--r--  llvm/test/Analysis/CostModel/RISCV/rvv-select.ll | 180
-rw-r--r--  llvm/test/Analysis/CostModel/RISCV/splice.ll | 26
-rw-r--r--  llvm/test/Assembler/fp-intrinsics-attr.ll | 8
-rw-r--r--  llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir | 100
-rw-r--r--  llvm/test/CodeGen/AArch64/aarch64-scalarize-vec-load-ext.ll | 35
-rw-r--r--  llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll | 129
-rw-r--r--  llvm/test/CodeGen/AArch64/early-ifcvt-likely-predictable.mir | 1
-rw-r--r--  llvm/test/CodeGen/AArch64/early-ifcvt-regclass-mismatch.mir | 1
-rw-r--r--  llvm/test/CodeGen/AArch64/early-ifcvt-same-value.mir | 1
-rw-r--r--  llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll | 121
-rw-r--r--  llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll | 426
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-asserts.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir | 40
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir | 127
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir | 127
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir | 54
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir | 54
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptosi.mir | 152
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptoui.mir | 152
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sitofp.mir | 51
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-uitofp.mir | 51
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll | 49
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-redundant-and.mir | 28
-rw-r--r--  llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir | 32
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-fake16.mir | 23
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir | 35
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir | 20
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll | 61
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll | 68
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll | 61
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fpext.f16.ll | 916
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fptoi.i128.ll | 12
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fptosi.f16.ll | 480
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fptoui.f16.ll | 477
-rw-r--r--  llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll | 6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/itofp.i128.ll | 166
-rw-r--r--  llvm/test/CodeGen/AMDGPU/sitofp.f16.ll | 371
-rw-r--r--  llvm/test/CodeGen/AMDGPU/uitofp.f16.ll | 371
-rw-r--r--  llvm/test/CodeGen/DirectX/WaveReadLaneAt-vec.ll | 35
-rw-r--r--  llvm/test/CodeGen/NVPTX/lower-byval-args.ll | 19
-rw-r--r--  llvm/test/CodeGen/PowerPC/early-ifcvt-no-isel.mir | 2
-rw-r--r--  llvm/test/CodeGen/PowerPC/scalar-rounding-ops.ll | 84
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/vector-splice.ll | 333
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll | 126
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/vl-opt.ll | 50
-rw-r--r--  llvm/test/CodeGen/Thumb2/avoidmuls.mir | 67
-rw-r--r--  llvm/test/CodeGen/X86/andnot-patterns.ll | 626
-rw-r--r--  llvm/test/CodeGen/X86/combine-sdiv.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/fp-intrinsics.ll | 59
-rw-r--r--  llvm/test/CodeGen/X86/fp128-libcalls-strict.ll | 45
-rw-r--r--  llvm/test/CodeGen/X86/fp80-strict-libcalls.ll | 30
-rw-r--r--  llvm/test/CodeGen/X86/llvm.atan2.ll | 80
-rw-r--r--  llvm/test/CodeGen/X86/masked_store_trunc.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/masked_store_trunc_usat.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/pr108731.ll | 127
-rw-r--r--  llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/tailcall-caller-nocsr.ll | 34
-rw-r--r--  llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll | 258
-rw-r--r--  llvm/test/Feature/fp-intrinsics.ll | 15
-rw-r--r--  llvm/test/MC/AMDGPU/gfx11_asm_vop3.s | 208
-rw-r--r--  llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s | 192
-rw-r--r--  llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s | 124
-rw-r--r--  llvm/test/MC/AMDGPU/gfx12_asm_vop3.s | 156
-rw-r--r--  llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s | 328
-rw-r--r--  llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s | 176
-rw-r--r--  llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt | 444
-rw-r--r--  llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt | 376
-rw-r--r--  llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt | 196
-rw-r--r--  llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt | 372
-rw-r--r--  llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt | 472
-rw-r--r--  llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt | 292
-rw-r--r--  llvm/test/MC/RISCV/rv32c-valid.s | 3
-rw-r--r--  llvm/test/Transforms/InstCombine/and-or-icmp-min-max.ll | 11
-rw-r--r--  llvm/test/Transforms/InstCombine/and-or-icmp-nullptr.ll | 13
-rw-r--r--  llvm/test/Transforms/InstCombine/icmp-and-shift.ll | 13
-rw-r--r--  llvm/test/Transforms/InstCombine/icmp-equality-test.ll | 16
-rw-r--r--  llvm/test/Transforms/InstCombine/icmp.ll | 15
-rw-r--r--  llvm/test/Transforms/InstCombine/select-cmp.ll | 89
-rw-r--r--  llvm/test/Transforms/InstCombine/select-icmp-xor.ll | 190
-rw-r--r--  llvm/test/Transforms/LoopVersioning/wrapping-pointer-non-integral-addrspace.ll | 140
-rwxr-xr-x  llvm/test/tools/llvm-cov/Inputs/binary-formats.v6.wasm32 | bin 87781 -> 0 bytes
-rw-r--r--  llvm/test/tools/llvm-cov/Inputs/binary-formats.wasm.proftext | 4
-rw-r--r--  llvm/test/tools/llvm-cov/binary-formats.c | 7
-rw-r--r--  llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp | 22
-rw-r--r--  llvm/unittests/Support/FormatVariadicTest.cpp | 2
-rw-r--r--  llvm/unittests/Transforms/Instrumentation/PGOInstrumentationTest.cpp | 68
-rw-r--r--  llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn | 1
-rw-r--r--  llvm/utils/gn/secondary/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/BUILD.gn | 14
-rw-r--r--  llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn | 2
-rwxr-xr-x  llvm/utils/lit/lit/reports.py | 14
-rw-r--r--  llvm/utils/lit/tests/shtest-format.py | 2
-rw-r--r--  llvm/utils/lit/tests/xunit-output.py | 2
-rwxr-xr-x  llvm/utils/revert_checker.py | 139
-rwxr-xr-x  llvm/utils/revert_checker_test.py | 23
-rw-r--r--  mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td | 1
-rw-r--r--  mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td | 2
-rw-r--r--  mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td | 2
-rw-r--r--  mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td | 44
-rw-r--r--  mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td | 25
-rw-r--r--  mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td | 18
-rw-r--r--  mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 2
-rw-r--r--  mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td | 4
-rw-r--r--  mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h | 2
-rw-r--r--  mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td | 32
-rw-r--r--  mlir/include/mlir/Interfaces/InferTypeOpInterface.h | 4
-rw-r--r--  mlir/include/mlir/Target/LLVMIR/ModuleImport.h | 2
-rw-r--r--  mlir/lib/Analysis/FlatLinearValueConstraints.cpp | 4
-rw-r--r--  mlir/lib/Dialect/Arith/Transforms/EmulateNarrowType.cpp | 4
-rw-r--r--  mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp | 1
-rw-r--r--  mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp | 96
-rw-r--r--  mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp | 10
-rw-r--r--  mlir/lib/Dialect/MLProgram/Transforms/PipelineGlobalOps.cpp | 5
-rw-r--r--  mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp | 17
-rw-r--r--  mlir/lib/Dialect/MemRef/Transforms/EmulateWideInt.cpp | 2
-rw-r--r--  mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 8
-rw-r--r--  mlir/lib/Dialect/Tensor/Utils/Utils.cpp | 6
-rw-r--r--  mlir/lib/Interfaces/InferTypeOpInterface.cpp | 14
-rw-r--r--  mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp | 6
-rw-r--r--  mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp | 16
-rw-r--r--  mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp | 6
-rw-r--r--  mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp | 435
-rw-r--r--  mlir/lib/Target/LLVMIR/ModuleImport.cpp | 32
-rw-r--r--  mlir/lib/Target/LLVMIR/ModuleTranslation.cpp | 37
-rw-r--r--  mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir | 2
-rw-r--r--  mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir | 4
-rw-r--r--  mlir/test/Dialect/LLVMIR/inlining.mlir | 4
-rw-r--r--  mlir/test/Dialect/LLVMIR/roundtrip.mlir | 27
-rw-r--r--  mlir/test/Dialect/MemRef/emulate-narrow-type.mlir | 21
-rw-r--r--  mlir/test/Dialect/MemRef/emulate-wide-int.mlir | 35
-rw-r--r--  mlir/test/Target/LLVMIR/Import/intrinsic.ll | 12
-rw-r--r--  mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir | 15
-rw-r--r--  mlir/test/Target/LLVMIR/llvmir-invalid.mlir | 2
-rw-r--r--  mlir/test/Target/LLVMIR/openmp-firstprivate.mlir | 25
-rw-r--r--  mlir/test/Target/LLVMIR/openmp-private.mlir | 6
-rw-r--r--  mlir/test/mlir-tblgen/op-decl-and-defs.td | 5
-rw-r--r--  mlir/tools/mlir-tblgen/OmpOpGen.cpp | 4
-rw-r--r--  mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp | 3
-rw-r--r--  polly/lib/CodeGen/BlockGenerators.cpp | 6
-rw-r--r--  polly/lib/CodeGen/IslNodeBuilder.cpp | 2
-rw-r--r--  polly/lib/CodeGen/LoopGeneratorsGOMP.cpp | 2
-rw-r--r--  utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel | 20
417 files changed, 15319 insertions, 4977 deletions
diff --git a/clang-tools-extra/clang-tidy/ClangTidy.cpp b/clang-tools-extra/clang-tidy/ClangTidy.cpp
index 62f9d19b..c4cac7d 100644
--- a/clang-tools-extra/clang-tidy/ClangTidy.cpp
+++ b/clang-tools-extra/clang-tidy/ClangTidy.cpp
@@ -458,7 +458,6 @@ ClangTidyASTConsumerFactory::createASTConsumer(
if (!AnalyzerOptions.CheckersAndPackages.empty()) {
setStaticAnalyzerCheckerOpts(Context.getOptions(), AnalyzerOptions);
AnalyzerOptions.AnalysisDiagOpt = PD_NONE;
- AnalyzerOptions.eagerlyAssumeBinOpBifurcation = true;
std::unique_ptr<ento::AnalysisASTConsumer> AnalysisConsumer =
ento::CreateAnalysisConsumer(Compiler);
AnalysisConsumer->AddDiagnosticConsumer(
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 817e3ab..dc5564b 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -99,6 +99,19 @@ C++ Specific Potentially Breaking Changes
// Was error, now evaluates to false.
constexpr bool b = f() == g();
+- Clang no longer considers pointers to non-class types as covariant return types.
+
+ .. code-block:: c++
+
+ struct A {
+ virtual const int *f() const;
+ };
+ struct B : A {
+ // Return type has less cv-qualification but doesn't point to a class;
+ // this is now an error.
+ int *f() const override;
+ };
+
- The warning ``-Wdeprecated-literal-operator`` is now on by default, as this is
something that WG21 has shown interest in removing from the language. The
result is that anyone who is compiling with ``-Werror`` should see this
@@ -605,6 +618,8 @@ Android Support
Windows Support
^^^^^^^^^^^^^^^
+- clang-cl now supports ``/std:c++23preview`` which enables C++23 features.
+
- Clang no longer allows references inside a union when emulating MSVC 1900+ even if `fms-extensions` is enabled.
Starting with VS2015, MSVC 1900, this Microsoft extension is no longer allowed and always results in an error.
Clang now follows the MSVC behavior in this scenario.
diff --git a/clang/docs/SafeBuffers.rst b/clang/docs/SafeBuffers.rst
new file mode 100644
index 0000000..144c3a7
--- /dev/null
+++ b/clang/docs/SafeBuffers.rst
@@ -0,0 +1,585 @@
+================
+C++ Safe Buffers
+================
+
+.. contents::
+ :local:
+
+
+Introduction
+============
+
+Clang can be used to harden your C++ code against buffer overflows, an otherwise
+common security issue with C-based languages.
+
+The solution described in this document is an integrated programming model as
+it combines:
+
+- a family of opt-in Clang warnings (``-Wunsafe-buffer-usage``) emitted
+ during compilation to help you update your code to encapsulate and propagate
+ the bounds information associated with pointers;
+- runtime assertions implemented as part of
+ `libc++ hardening modes <https://libcxx.llvm.org/Hardening.html>`_
+ that eliminate undefined behavior as long as the coding convention
+ is followed and the bounds information is therefore available and correct.
+
+The goal of this work is to enable development of bounds-safe C++ code. It is
+not a "push-button" solution; depending on your codebase's existing
+coding style, significant (even if largely mechanical) changes to your code
+may be necessary. However, it allows you to achieve valuable safety guarantees
+on security-critical parts of your codebase.
+
+This solution is under active development. It is already useful for its purpose
+but more work is being done to improve ergonomics and safety guarantees
+and reduce adoption costs.
+
+The solution aligns in spirit with the "Ranges" safety profile
+that was `proposed <https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2024/p3274r0.pdf>`_
+by Bjarne Stroustrup for standardization alongside other C++ safety features.
+
+
+Pre-Requisites
+==============
+
+In order to achieve bounds safety, your codebase needs to have access to
+well-encapsulated bounds-safe container, view, and iterator types.
+If your project uses libc++, standard container and view types such as
+``std::vector`` and ``std::span`` can be made bounds-safe by enabling
+the "fast" `hardening mode <https://libcxx.llvm.org/Hardening.html>`_
+(by passing ``-D_LIBCPP_HARDENING_MODE=_LIBCPP_HARDENING_MODE_FAST`` to your
+compiler) or any of the stricter hardening modes.
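+
+For illustration, here is a minimal sketch of what a hardening mode changes
+for an out-of-bounds access (assuming libc++ with hardening enabled as
+described above)::
+
+  #include <vector>
+
+  int main() {
+    std::vector<int> v{1, 2, 3};
+    // With a hardening mode enabled, this out-of-bounds access traps
+    // deterministically at runtime instead of causing undefined behavior.
+    return v[3];
+  }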
+
+In order to harden iterators, you'll need to also obtain a libc++ binary
+built with ``_LIBCPP_ABI_BOUNDED_ITERATORS`` -- which is a libc++ ABI setting
+that needs to be set for your entire target platform if you need to maintain
+binary compatibility with the rest of the platform.
+
+A relatively recent version of C++ is recommended. In particular, the very
+useful standard view class ``std::span`` requires C++20.
+
+Other implementations of the C++ standard library may provide different
+flags to enable such hardening.
+
+If you're using custom containers and views, they will need to be hardened
+this way as well, but you don't necessarily need to do this ahead of time.
+
+This approach can theoretically be applied to plain C codebases,
+assuming that safe primitives are developed to encapsulate all buffer accesses,
+acting as "hardened custom containers" to replace raw pointers.
+However, such an approach would be very unergonomic in C, and the safety
+guarantees would be weaker due to the lack of good encapsulation facilities.
+A better approach
+to bounds safety for non-C++ programs,
+`-fbounds-safety <https://clang.llvm.org/docs/BoundsSafety.html>`_,
+is currently in development.
+
+Technically, safety guarantees cannot be provided without hardening
+the entire technology stack, including all of your dependencies.
+However, applying such hardening technology to even a small portion
+of your code may be significantly better than nothing.
+
+
+The Programming Model for C++
+=============================
+
+Assuming that hardened container, view, and iterator classes are available,
+what remains is to make sure they are used consistently in your code.
+Below we define the specific coding convention that needs to be followed
+in order to guarantee safety, and explain how the compiler technology
+around ``-Wunsafe-buffer-usage`` assists with that.
+
+
+Buffer operations should never be performed over raw pointers
+-------------------------------------------------------------
+
+Every time a memory access is made, a bounds-safe program must guarantee
+that the range of accessed memory addresses falls into the boundaries
+of the memory allocated for the object that's being accessed.
+In order to establish such a guarantee, the information about such valid range
+of addresses -- the **bounds information** associated with the accessed address
+-- must be formally available every time a memory access is performed.
+
+A raw pointer does not naturally carry any bounds information.
+The bounds information for the pointer may be available *somewhere*, but
+it is not associated with the pointer in a formal manner, so a memory access
+performed through a raw pointer cannot be automatically verified to be
+bounds-safe by the compiler.
+
+That said, the Safe Buffers programming model does **not** try to eliminate
+**all** pointer usage. Instead it assumes that most pointers point to
+individual objects, not buffers, and therefore they typically aren't
+associated with buffer overflow risks. For that reason, in order to identify
+the code that requires manual intervention, it is desirable to initially shift
+the focus away from the pointers themselves, and instead focus on their
+**usage patterns**.
+
+The compiler warning ``-Wunsafe-buffer-usage`` is built to assist you
+with this step of the process. A ``-Wunsafe-buffer-usage`` warning is
+emitted whenever one of the following **buffer operations** is performed
+on a raw pointer (each is illustrated in the sketch after this list):
+
+- array indexing with ``[]``,
+- pointer arithmetic,
+- bounds-unsafe standard C functions such as ``std::memcpy()``,
+- C++ smart pointer operations such as ``std::unique_ptr<T[]>::operator[]()``,
+ which unfortunately cannot be made fully safe within the rules of
+ the C++ standard (as of C++23).
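+
+For illustration, a minimal sketch of each of these operations (assuming
+``p`` and ``q`` are raw ``int *`` buffer pointers, ``n`` is a byte count,
+and ``up`` is a ``std::unique_ptr<int[]>``)::
+
+  p[5] = 1;             // array indexing with []
+  p += 2;               // pointer arithmetic
+  std::memcpy(p, q, n); // bounds-unsafe standard C function
+  up[3] = 1;            // smart pointer operator[]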
+
+This is sufficient for identifying each raw buffer pointer in the program at
+**at least one point** during its lifetime across your software stack.
+
+For example, both of the following functions are flagged by
+``-Wunsafe-buffer-usage`` because ``pointer`` gets identified as an unsafe
+buffer pointer. Even though the second function does not directly access
+the buffer, the pointer arithmetic operation inside it may easily be
+the only formal "hint" in the program that the pointer does indeed point
+to a buffer of multiple objects::
+
+  int get_last_element(int *pointer, size_t size) {
+    return pointer[size - 1]; // warning: unsafe buffer access
+  }
+
+  int *get_last_element_ptr(int *pointer, size_t size) {
+    return pointer + (size - 1); // warning: unsafe pointer arithmetic
+  }
+
+
+All buffers need to be encapsulated into safe container and view types
+----------------------------------------------------------------------
+
+It follows from the previous requirement that once an unsafe pointer
+is identified at any point during its lifetime, it should immediately be wrapped
+into a safe container type (if the allocation site is "nearby") or a safe
+view type (if the allocation site is "far away"). Not only memory accesses,
+but also non-access operations such as pointer arithmetic need to be covered
+this way in order to benefit from the respective runtime bounds checks.
+
+If a **container** type (``std::array``, ``std::vector``, ``std::string``)
+is used for allocating the buffer, this is the best-case scenario because
+the container naturally has access to the correct bounds information for the
+buffer, and the runtime bounds checks immediately kick in. Additionally,
+the container type may provide automatic lifetime management for the buffer
+(which may or may not be desirable).
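+
+For example, when the allocation site uses a container directly, no manual
+bounds propagation is needed at all; a sketch, assuming libc++ hardening is
+enabled and ``count`` and ``index`` are illustrative values::
+
+  std::vector<int> buf(count); // the container owns the bounds information
+  buf[index] = 42;             // hardened operator[] checks the index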
+
+If a **view** type is used (``std::span``, ``std::string_view``), this typically
+means that the bounds information for the "adopted" pointer needs to be passed
+to the view's constructor manually. This makes runtime checks immediately
+kick in with respect to the provided bounds information, which is an immediate
+improvement over the raw pointer. However, this situation is still fundamentally
+insufficient for security purposes, because **bounds information provided
+this way cannot be guaranteed to be correct**.
+
+For example, the function ``get_last_element()`` we've seen in the previous
+section can be made **slightly** safer this way::
+
+  int get_last_element(int *pointer, size_t size) {
+    std::span<int> sp(pointer, size);
+    return sp[size - 1]; // warning addressed
+  }
+
+Here ``std::span`` eliminates the potential concern that the operation
+``size - 1`` may overflow when ``size`` is equal to ``0``, leading to a buffer
+"underrun". However, such a program does not provide a guarantee that
+the variable ``size`` correctly represents the **actual** size of the buffer
+pointed to by ``pointer``. The ``std::span`` constructed this way may be
+ill-formed. It may fail to protect you from overrunning the original buffer.
+
+The following example demonstrates one of the most dangerous anti-patterns
+of this nature::
+
+  void convert_data(int *source_buf, size_t source_size,
+                    int *target_buf, size_t target_size) {
+    // Terrible: mismatched pointer / size.
+    std::span<int> target_span(target_buf, source_size);
+    // ...
+  }
+
+The second parameter of ``std::span`` should never be the **desired** size
+of the buffer. It should always be the **actual** size of the buffer.
+Such code often indicates that the original code has already contained
+a vulnerability -- and the use of a safe view class failed to prevent it.
+
+If ``target_span`` actually needs to be of size ``source_size``, a significantly
+safer way to produce such a span would be to build it with the correct size
+first, and then shrink it to the desired size by calling ``.first()``::
+
+  void convert_data(int *source_buf, size_t source_size,
+                    int *target_buf, size_t target_size) {
+    // Safer.
+    std::span<int> target_span(target_buf, target_size).first(source_size);
+    // ...
+  }
+
+However, these are still half-measures. This code still accepts the
+bounds information from the caller in an **informal** manner, and such bounds
+information cannot be guaranteed to be correct.
+
+In order to mitigate problems of this nature in their entirety,
+the third guideline is imposed.
+
+
+Encapsulation of bounds information must be respected continuously
+------------------------------------------------------------------
+
+The allocation site of the object is the only reliable source of bounds
+information for that object. For objects with long lifespans across
+multiple functions or even libraries in the software stack, it is essential
+to formally preserve the original bounds information as it's being passed
+from one piece of code to another.
+
+Standard container and view classes are designed to preserve bounds information
+correctly **by construction**. However, they offer a number of ways to "break"
+encapsulation, which may cause you to temporarily lose track of the correct
+bounds information (each is shown in the sketch after this list):
+
+- The two-parameter constructor ``std::span(ptr, size)`` allows you to
+ assemble an ill-formed ``std::span``;
+- Conversely, you can unwrap a container or a view object into a raw pointer
+ and a raw size by calling its ``.data()`` and ``.size()`` methods;
+- The overloaded ``operator&()`` found on container and iterator classes
+ acts similarly to ``.data()`` in this regard; operations such as
+ ``&span[0]`` and ``&*span.begin()`` are effectively unsafe.
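+
+A minimal sketch of these escape hatches (assuming ``ptr`` and ``size`` came
+from elsewhere and may be wrong)::
+
+  std::span<int> sp(ptr, size); // ill-formed if size is incorrect
+  int *p = sp.data();           // bounds information is detached here
+  size_t n = sp.size();
+  int *q = &sp[0];              // effectively the same as .data()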
+
+Additional ``-Wunsafe-buffer-usage`` warnings are emitted when encapsulation
+of **standard** containers is broken in this manner. If you're using
+non-standard containers, you can achieve a similar effect with facilities
+described in the next section: :ref:`customization`.
+
+For example, our previous attempt to address the warning in
+``get_last_element()`` has actually introduced a new warning along the way
+that notifies you about the potentially incorrect bounds information
+passed into the two-parameter constructor of ``std::span``::
+
+  int get_last_element(int *pointer, size_t size) {
+    std::span<int> sp(pointer, size); // warning: unsafe constructor
+    return sp[size - 1];
+  }
+
+In order to address this warning, you need to make the function receive
+the bounds information from the allocation site in a formal manner.
+The function doesn't necessarily need to know where the allocation site is;
+it simply needs to be able to accept bounds information **when** it's available.
+You can achieve this by refactoring the function to accept a ``std::span``
+as a parameter::
+
+  int get_last_element(std::span<int> sp) {
+    return sp[sp.size() - 1];
+  }
+
+This solution puts the responsibility for making sure the span is well-formed
+on the **caller**. They should do the same, so that eventually the
+responsibility is placed on the allocation site!
+
+Such a definition is also very ergonomic as it naturally accepts arbitrary
+standard containers without any additional code at the call site::
+
+  void use_last_element() {
+    std::vector<int> vec { 1, 2, 3 };
+    int x = get_last_element(vec); // x = 3
+  }
+
+Such code is naturally bounds-safe because bounds information is passed down
+from the allocation site to the buffer access site. Only safe operations
+are performed on container types. The containers are never "unforged" into
+raw pointer-size pairs and never "reforged" again. This is what ideal
+bounds-safe C++ code looks like.
+
+
+.. _customization:
+
+Backwards Compatibility, Interoperation with Unsafe Code, Customization
+=======================================================================
+
+Some of the code changes described above can be somewhat intrusive.
+For example, changing a function that previously accepted a pointer and a size
+separately, to accept a ``std::span`` instead, may require you to update
+every call site of the function. This is often undesirable and sometimes
+completely unacceptable when backwards compatibility is required.
+
+In order to facilitate **incremental adoption** of the coding convention
+described above, as well as to handle various unusual situations, the compiler
+provides two additional facilities to give the user more control over
+``-Wunsafe-buffer-usage`` diagnostics:
+
+- ``#pragma clang unsafe_buffer_usage`` to mark code as unsafe and **suppress**
+ ``-Wunsafe-buffer-usage`` warnings in that code.
+- ``[[clang::unsafe_buffer_usage]]`` to annotate potential sources of
+ discontinuity of bounds information -- thus introducing
+ **additional** ``-Wunsafe-buffer-usage`` warnings.
+
+In this section we describe these facilities in detail and show how they can
+help you with various unusual situations.
+
+Suppress unwanted warnings with ``#pragma clang unsafe_buffer_usage``
+---------------------------------------------------------------------
+
+If you really need to write unsafe code, you can always suppress all
+``-Wunsafe-buffer-usage`` warnings in a section of code by surrounding
+that code with the ``unsafe_buffer_usage`` pragma. For example, if you don't
+want to address the warning in our example function ``get_last_element()``,
+here is how you can suppress it::
+
+  int get_last_element(int *pointer, size_t size) {
+  #pragma clang unsafe_buffer_usage begin
+    return pointer[size - 1]; // warning suppressed
+  #pragma clang unsafe_buffer_usage end
+  }
+
+This behavior is analogous to ``#pragma clang diagnostic`` (`documentation
+<https://clang.llvm.org/docs/UsersManual.html#controlling-diagnostics-via-pragmas>`_).
+However, ``#pragma clang unsafe_buffer_usage`` is specialized and recommended
+over ``#pragma clang diagnostic`` for a number of technical and non-technical
+reasons. Most importantly, ``#pragma clang unsafe_buffer_usage`` is more
+suitable for security audits because it is significantly simpler and
+describes unsafe code in a more formal manner. On the contrary,
+``#pragma clang diagnostic`` comes with a push/pop syntax (as opposed to
+the begin/end syntax) and it offers ways to suppress warnings without
+mentioning them by name (such as ``-Weverything``), which can make it
+difficult to determine at a glance whether the warning is suppressed
+on any given line of code.
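+
+For comparison, achieving the same suppression with the generic diagnostic
+pragmas would look like this::
+
+  #pragma clang diagnostic push
+  #pragma clang diagnostic ignored "-Wunsafe-buffer-usage"
+  // ... unsafe code ...
+  #pragma clang diagnostic pop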
+
+There are a few natural reasons to use this pragma:
+
+- In implementations of safe custom containers. You need this because ultimately
+ ``-Wunsafe-buffer-usage`` cannot help you verify that your custom container
+ is safe. It will naturally remind you to audit your container's implementation
+ to make sure it has all the necessary runtime checks, but ultimately you'll
+ need to suppress it once the audit is complete.
+- In performance-critical code where bounds-safety-related runtime checks
+ cause an unacceptable performance regression. The compiler can theoretically
+ optimize them away (e.g., replace a repeated bounds check in a loop with
+ a single check before the loop) but it is not guaranteed to do that.
+- For incremental adoption purposes. If you want to adopt the coding convention
+ gradually, you can always surround an entire file with the
+ ``unsafe_buffer_usage`` pragma and then "make holes" in it whenever
+ you address warnings on specific portions of the code (see the sketch
+ after this list).
+- In code that interoperates with unsafe code. This may be code that
+ will never follow the programming model (such as plain C code that will
+ never be converted to C++) or code that simply hasn't been converted yet.
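+
+As a sketch of the incremental-adoption pattern mentioned above, a partially
+converted file might look like this (``legacy_code`` is a hypothetical
+unconverted function)::
+
+  #pragma clang unsafe_buffer_usage begin
+
+  void legacy_code(int *pointer, size_t size) {
+    // ... not converted yet; warnings are suppressed here ...
+  }
+
+  #pragma clang unsafe_buffer_usage end
+
+  // A "hole" in the suppression: already-converted code stays covered
+  // by -Wunsafe-buffer-usage.
+  int get_last_element(std::span<int> sp) {
+    return sp[sp.size() - 1];
+  }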
+
+Interoperation with unsafe code may require a lot of suppressions.
+You are encouraged to introduce "unsafe wrapper functions" for various unsafe
+operations that you need to perform regularly.
+
+For example, if you regularly receive pointer/size pairs from unsafe code,
+you may want to introduce a wrapper function for the unsafe span constructor::
+
+  #pragma clang unsafe_buffer_usage begin
+
+  template <typename T>
+  std::span<T> unsafe_forge_span(T *pointer, size_t size) {
+    return std::span(pointer, size);
+  }
+
+  #pragma clang unsafe_buffer_usage end
+
+Such a wrapper function can be used to suppress warnings about unsafe span
+constructor usage in a more ergonomic manner::
+
+  void use_unsafe_c_struct(unsafe_c_struct *s) {
+    // No warning here.
+    std::span<int> sp = unsafe_forge_span(s->pointer, s->size);
+    // ...
+  }
+
+The code remains unsafe but it also continues to be nicely readable, and it
+proves that ``-Wunsafe-buffer-usage`` has done its best to notify you about
+the potential unsafety. A security auditor will need to keep an eye on such
+unsafe wrappers. **It is still up to you to confirm that the bounds information
+passed into the wrapper is correct.**
+
+
+Flag bounds information discontinuities with ``[[clang::unsafe_buffer_usage]]``
+-------------------------------------------------------------------------------
+
+The clang attribute ``[[clang::unsafe_buffer_usage]]``
+(`attribute documentation
+<https://clang.llvm.org/docs/AttributeReference.html#unsafe-buffer-usage>`_)
+allows the user to annotate various objects, such as functions or member
+variables, as incompatible with the Safe Buffers programming model.
+You are encouraged to do that for arbitrary reasons, but typically the main
+reason is that an unsafe function needs to be provided for
+backwards compatibility.
+
+For example, in the previous section we've seen how the example function
+``get_last_element()`` needed to have its parameter types changed in order
+to preserve the continuity of bounds information when receiving a buffer pointer
+from the caller. However, such a change breaks both API and ABI compatibility.
+The code that previously used this function will no longer compile, nor link,
+until every call site of that function is updated. You can reclaim
+backwards compatibility -- in terms of both API and ABI -- by adding
+a "compatibility overload"::
+
+  int get_last_element(std::span<int> sp) {
+    return sp[sp.size() - 1];
+  }
+
+  [[clang::unsafe_buffer_usage]] // Please use the new function.
+  int get_last_element(int *pointer, size_t size) {
+    // Avoid code duplication - simply invoke the safe function!
+    // The pragma suppresses the unsafe constructor warning.
+  #pragma clang unsafe_buffer_usage begin
+    return get_last_element(std::span(pointer, size));
+  #pragma clang unsafe_buffer_usage end
+  }
+
+
+Such an overload allows the surrounding code to continue to work.
+It is both source-compatible and binary-compatible. It is also strictly safer
+than the original function because the unsafe buffer access through raw pointer
+is replaced with a safe ``std::span`` access no matter how it's called. However,
+because it requires the caller to pass the pointer and the size separately,
+it violates our "bounds information continuity" principle. This means that
+the callers who care about bounds safety need to be encouraged to use the
+``std::span``-based overload instead. Luckily, the attribute
+``[[clang::unsafe_buffer_usage]]`` causes a ``-Wunsafe-buffer-usage`` warning
+to be displayed at every call site of the compatibility overload in order to
+remind the callers to update their code::
+
+  void use_last_element() {
+    std::vector<int> vec { 1, 2, 3 };
+
+    // no warning
+    int x = get_last_element(vec);
+
+    // warning: this overload introduces unsafe buffer manipulation
+    int y = get_last_element(vec.data(), vec.size());
+  }
+
+The compatibility overload can be further simplified with the help of the
+``unsafe_forge_span()`` wrapper as described in the previous section --
+and it even makes the pragmas unnecessary::
+
+  [[clang::unsafe_buffer_usage]] // Please use the new function.
+  int get_last_element(int *pointer, size_t size) {
+    // Avoid code duplication - simply invoke the safe function!
+    return get_last_element(unsafe_forge_span(pointer, size));
+  }
+
+Notice how the attribute ``[[clang::unsafe_buffer_usage]]`` does **not**
+suppress the warnings within the function on its own. Similarly, functions whose
+entire definitions are covered by ``#pragma clang unsafe_buffer_usage`` do
+**not** become automatically annotated with the attribute
+``[[clang::unsafe_buffer_usage]]``. They serve two different purposes:
+
+- The pragma says that the function isn't safely **written**;
+- The attribute says that the function isn't safe to **use**.
+
+Also notice how we've made an **unsafe** wrapper for a **safe** function.
+This is significantly better than making a **safe** wrapper for an **unsafe**
+function. In other words, the following solution is significantly more unsafe
+and undesirable than the previous solution::
+
+  int get_last_element(std::span<int> sp) {
+    // You've just added that attribute, and now you need to
+    // immediately suppress the warning that comes with it?
+  #pragma clang unsafe_buffer_usage begin
+    return get_last_element(sp.data(), sp.size());
+  #pragma clang unsafe_buffer_usage end
+  }
+
+
+  [[clang::unsafe_buffer_usage]]
+  int get_last_element(int *pointer, size_t size) {
+    // This access is still completely unchecked. What's the point of having
+    // perfect bounds information if you aren't performing runtime checks?
+  #pragma clang unsafe_buffer_usage begin
+    return pointer[size - 1];
+  #pragma clang unsafe_buffer_usage end
+  }
+
+**Structs and classes**, unlike functions, cannot be overloaded. If a struct
+contains an unsafe buffer (in the form of a nested array or a pointer/size pair)
+then it is typically impossible to replace them with a safe container (such as
+``std::array`` or ``std::span`` respectively) without breaking the layout
+of the struct and introducing both source and binary incompatibilities with
+the surrounding client code.
+
+Additionally, member variables of a class cannot be naturally "hidden" from
+client code. If a class needs to be used by clients who haven't updated to
+C++20 yet, you cannot use the C++20-specific ``std::span`` as a member variable
+type. If the definition of a struct is shared with plain C code that manipulates
+member variables directly, you cannot use any C++-specific types for these
+member variables.
+
+In such cases there's usually no backwards-compatible way to use safe types
+directly. The best option is usually to discourage the clients from using
+member variables directly by annotating the member variables with the attribute
+``[[clang::unsafe_buffer_usage]]``, and then to change the interface
+of the class to provide safe "accessors" to the unsafe data.
+
+For example, let's assume the worst-case scenario: ``struct foo`` is an unsafe
+struct type fully defined in a header shared between plain C code and C++ code::
+
+  struct foo {
+    int *pointer;
+    size_t size;
+  };
+
+In this case you can achieve safety in C++ code by annotating the member
+variables as unsafe and encapsulating them into safe accessor methods::
+
+  struct foo {
+    [[clang::unsafe_buffer_usage]]
+    int *pointer;
+    [[clang::unsafe_buffer_usage]]
+    size_t size;
+
+    // Avoid showing this code to clients who are unable to digest it.
+  #if __cplusplus >= 202002L
+    std::span<int> get_pointer_as_span() {
+  #pragma clang unsafe_buffer_usage begin
+      return std::span(pointer, size);
+  #pragma clang unsafe_buffer_usage end
+    }
+
+    void set_pointer_from_span(std::span<int> sp) {
+  #pragma clang unsafe_buffer_usage begin
+      pointer = sp.data();
+      size = sp.size();
+  #pragma clang unsafe_buffer_usage end
+    }
+
+    // Potentially more utility functions.
+  #endif
+  };
+
+Future Work
+===========
+
+The ``-Wunsafe-buffer-usage`` technology is in active development. The warning
+is largely ready for everyday use but it is continuously improved to reduce
+unnecessary noise as well as to cover some of the trickier unsafe operations.
+
+Fix-It Hints for ``-Wunsafe-buffer-usage``
+------------------------------------------
+
+A code transformation tool is in development that can semi-automatically
+transform large bodies of code to follow the C++ Safe Buffers programming model.
+It can currently be accessed by passing the experimental flag
+``-fsafe-buffer-usage-suggestions`` in addition to ``-Wunsafe-buffer-usage``.
+
+Fixits produced this way currently assume the default approach described
+in this document as they suggest standard containers and views (most notably
+``std::span`` and ``std::array``) as replacements for raw buffer pointers.
+This additionally requires libc++ hardening in order to make the runtime
+bounds checks actually happen.
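+
+As a purely illustrative sketch (not necessarily the tool's exact output),
+such a fix-it may rewrite a raw pointer parameter into a ``std::span``
+parameter::
+
+  // Before:
+  int get_last_element(int *pointer, size_t size);
+
+  // After a suggested transformation:
+  int get_last_element(std::span<int> pointer, size_t size);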
+
+Static Analysis to Identify Suspicious Sources of Bounds Information
+--------------------------------------------------------------------
+
+The unsafe constructor ``span(pointer, size)`` is often a necessary evil
+when it comes to interoperation with unsafe code. However, passing the
+correct bounds information to such a constructor is often difficult.
+In order to detect those ``span(target_pointer, source_size)`` anti-patterns,
+path-sensitive analysis performed by `the clang static analyzer
+<https://clang-analyzer.llvm.org>`_ can be taught to identify situations
+when the pointer and the size are coming from "suspiciously different" sources.
+
+Such analysis will be able to identify the source of information with
+significantly higher precision than that of the compiler, making it much better
+at identifying incorrect bounds information in your code while producing
+significantly fewer warnings. It will also need to bypass
+``#pragma clang unsafe_buffer_usage`` suppressions and "see through"
+unsafe wrappers such as ``unsafe_forge_span`` -- something that
+the static analyzer is naturally capable of doing.
diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst
index 8126442..58dbd68 100644
--- a/clang/docs/analyzer/checkers.rst
+++ b/clang/docs/analyzer/checkers.rst
@@ -3371,12 +3371,23 @@ Checks for overlap in two buffer arguments. Applies to: ``memcpy, mempcpy, wmem
alpha.unix.cstring.NotNullTerminated (C)
""""""""""""""""""""""""""""""""""""""""
-Check for arguments which are not null-terminated strings; applies to: ``strlen, strnlen, strcpy, strncpy, strcat, strncat, wcslen, wcsnlen``.
+Check for arguments which are not null-terminated strings;
+applies to the ``strlen``, ``strcpy``, ``strcat``, ``strcmp`` family of functions.
+
+Only very fundamental cases are detected, where the passed memory block
+cannot possibly be a null-terminated string. This checker does not detect
+a memory buffer that is merely missing its terminating zero character.
.. code-block:: c
- void test() {
- int y = strlen((char *)&test); // warn
+  void test1() {
+    int l = strlen((char *)&test); // warn
+  }
+
+  void test2() {
+  label:
+    int l = strlen((char *)&&label); // warn
}
.. _alpha-unix-cstring-OutOfBounds:
diff --git a/clang/docs/index.rst b/clang/docs/index.rst
index f4fdc93..0f6fb36 100644
--- a/clang/docs/index.rst
+++ b/clang/docs/index.rst
@@ -25,6 +25,7 @@ Using Clang as a Compiler
CrossCompilation
ClangStaticAnalyzer
ThreadSafetyAnalysis
+ SafeBuffers
DataFlowAnalysisIntro
AddressSanitizer
ThreadSanitizer
diff --git a/clang/include/clang/AST/DeclTemplate.h b/clang/include/clang/AST/DeclTemplate.h
index 141f58c..0f0c0bf 100644
--- a/clang/include/clang/AST/DeclTemplate.h
+++ b/clang/include/clang/AST/DeclTemplate.h
@@ -2085,7 +2085,11 @@ public:
class ClassTemplatePartialSpecializationDecl
: public ClassTemplateSpecializationDecl {
/// The list of template parameters
- TemplateParameterList* TemplateParams = nullptr;
+ TemplateParameterList *TemplateParams = nullptr;
+
+ /// The set of "injected" template arguments used within this
+ /// partial specialization.
+ TemplateArgument *InjectedArgs = nullptr;
/// The class template partial specialization from which this
/// class template partial specialization was instantiated.
@@ -2132,6 +2136,10 @@ public:
return TemplateParams;
}
+ /// Retrieve the template arguments list of the template parameter list
+ /// of this template.
+ ArrayRef<TemplateArgument> getInjectedTemplateArgs();
+
/// \brief All associated constraints of this partial specialization,
/// including the requires clause and any constraints derived from
/// constrained-parameters.
@@ -2856,6 +2864,10 @@ class VarTemplatePartialSpecializationDecl
/// The list of template parameters
TemplateParameterList *TemplateParams = nullptr;
+ /// The set of "injected" template arguments used within this
+ /// partial specialization.
+ TemplateArgument *InjectedArgs = nullptr;
+
/// The variable template partial specialization from which this
/// variable template partial specialization was instantiated.
///
@@ -2902,6 +2914,10 @@ public:
return TemplateParams;
}
+ /// Retrieve the template arguments list of the template parameter list
+ /// of this template.
+ ArrayRef<TemplateArgument> getInjectedTemplateArgs();
+
/// \brief All associated constraints of this partial specialization,
/// including the requires clause and any constraints derived from
/// constrained-parameters.
diff --git a/clang/include/clang/Analysis/FlowSensitive/CachedConstAccessorsLattice.h b/clang/include/clang/Analysis/FlowSensitive/CachedConstAccessorsLattice.h
new file mode 100644
index 0000000..3402d10
--- /dev/null
+++ b/clang/include/clang/Analysis/FlowSensitive/CachedConstAccessorsLattice.h
@@ -0,0 +1,217 @@
+//===-- CachedConstAccessorsLattice.h ---------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the lattice mixin that additionally maintains a cache of
+// stable method call return values to model const accessor member functions.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_CACHED_CONST_ACCESSORS_LATTICE_H
+#define LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_CACHED_CONST_ACCESSORS_LATTICE_H
+
+#include "clang/AST/Expr.h"
+#include "clang/Analysis/FlowSensitive/DataflowEnvironment.h"
+#include "clang/Analysis/FlowSensitive/DataflowLattice.h"
+#include "clang/Analysis/FlowSensitive/StorageLocation.h"
+#include "clang/Analysis/FlowSensitive/Value.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLFunctionalExtras.h"
+
+namespace clang {
+namespace dataflow {
+
+/// A mixin for a lattice that additionally maintains a cache of stable method
+/// call return values to model const accessor methods. When a non-const method
+/// is called, the cache should be cleared, causing the next call to a const
+/// method to be treated as returning a different value. NOTE: The user is
+/// responsible for clearing the cache.
+///
+/// For example:
+///
+/// class Bar {
+/// public:
+/// const std::optional<Foo>& getFoo() const;
+/// void clear();
+/// };
+///
+/// void func(Bar& s) {
+/// if (s.getFoo().has_value()) {
+/// use(s.getFoo().value()); // safe (checked by the earlier getFoo() call)
+/// s.clear();
+/// use(s.getFoo().value()); // unsafe (clear() invalidated the cache for s)
+/// }
+/// }
+template <typename Base> class CachedConstAccessorsLattice : public Base {
+public:
+ using Base::Base; // inherit all constructors
+
+ /// Creates or returns a previously created `Value` associated with a const
+ /// method call `obj.getFoo()` where `RecordLoc` is the
+ /// `RecordStorageLocation` of `obj`.
+ /// Returns nullptr if unable to find or create a value.
+ ///
+ /// Requirements:
+ ///
+ /// - `CE` should return a value (not a reference or record type).
+ Value *
+ getOrCreateConstMethodReturnValue(const RecordStorageLocation &RecordLoc,
+ const CallExpr *CE, Environment &Env);
+
+ /// Creates or returns a previously created `StorageLocation` associated with
+ /// a const method call `obj.getFoo()` where `RecordLoc` is the
+ /// `RecordStorageLocation` of `obj`.
+ ///
+ /// The callback `Initialize` runs on the storage location if newly created.
+ /// Returns nullptr if unable to find or create a value.
+ ///
+ /// Requirements:
+ ///
+ /// - `CE` should return a location (GLValue or a record type).
+ StorageLocation *getOrCreateConstMethodReturnStorageLocation(
+ const RecordStorageLocation &RecordLoc, const CallExpr *CE,
+ Environment &Env, llvm::function_ref<void(StorageLocation &)> Initialize);
+
+ void clearConstMethodReturnValues(const RecordStorageLocation &RecordLoc) {
+ ConstMethodReturnValues.erase(&RecordLoc);
+ }
+
+ void clearConstMethodReturnStorageLocations(
+ const RecordStorageLocation &RecordLoc) {
+ ConstMethodReturnStorageLocations.erase(&RecordLoc);
+ }
+
+ bool operator==(const CachedConstAccessorsLattice &Other) const {
+ return Base::operator==(Other);
+ }
+
+ LatticeJoinEffect join(const CachedConstAccessorsLattice &Other);
+
+private:
+ // Maps a record storage location and const method to the value to return
+ // from that const method.
+ using ConstMethodReturnValuesType =
+ llvm::SmallDenseMap<const RecordStorageLocation *,
+ llvm::SmallDenseMap<const FunctionDecl *, Value *>>;
+ ConstMethodReturnValuesType ConstMethodReturnValues;
+
+ // Maps a record storage location and const method to the record storage
+ // location to return from that const method.
+ using ConstMethodReturnStorageLocationsType = llvm::SmallDenseMap<
+ const RecordStorageLocation *,
+ llvm::SmallDenseMap<const FunctionDecl *, StorageLocation *>>;
+ ConstMethodReturnStorageLocationsType ConstMethodReturnStorageLocations;
+};
+
+namespace internal {
+
+template <typename T>
+llvm::SmallDenseMap<const RecordStorageLocation *,
+ llvm::SmallDenseMap<const FunctionDecl *, T *>>
+joinConstMethodMap(
+ const llvm::SmallDenseMap<const RecordStorageLocation *,
+ llvm::SmallDenseMap<const FunctionDecl *, T *>>
+ &Map1,
+ const llvm::SmallDenseMap<const RecordStorageLocation *,
+ llvm::SmallDenseMap<const FunctionDecl *, T *>>
+ &Map2,
+ LatticeEffect &Effect) {
+ llvm::SmallDenseMap<const RecordStorageLocation *,
+ llvm::SmallDenseMap<const FunctionDecl *, T *>>
+ Result;
+ for (auto &[Loc, DeclToT] : Map1) {
+ auto It = Map2.find(Loc);
+ if (It == Map2.end()) {
+ Effect = LatticeJoinEffect::Changed;
+ continue;
+ }
+ const auto &OtherDeclToT = It->second;
+ auto &JoinedDeclToT = Result[Loc];
+ for (auto [Func, Var] : DeclToT) {
+ T *OtherVar = OtherDeclToT.lookup(Func);
+ if (OtherVar == nullptr || OtherVar != Var) {
+ Effect = LatticeJoinEffect::Changed;
+ continue;
+ }
+ JoinedDeclToT.insert({Func, Var});
+ }
+ }
+ return Result;
+}
+
+} // namespace internal
+
+template <typename Base>
+LatticeEffect CachedConstAccessorsLattice<Base>::join(
+ const CachedConstAccessorsLattice<Base> &Other) {
+
+ LatticeEffect Effect = Base::join(Other);
+
+ // For simplicity, we only retain values that are identical, but not ones that
+ // are non-identical but equivalent. This is likely to be sufficient in
+ // practice, and it reduces implementation complexity considerably.
+
+ ConstMethodReturnValues = internal::joinConstMethodMap<Value>(
+ ConstMethodReturnValues, Other.ConstMethodReturnValues, Effect);
+
+ ConstMethodReturnStorageLocations =
+ internal::joinConstMethodMap<StorageLocation>(
+ ConstMethodReturnStorageLocations,
+ Other.ConstMethodReturnStorageLocations, Effect);
+
+ return Effect;
+}
+
+template <typename Base>
+Value *CachedConstAccessorsLattice<Base>::getOrCreateConstMethodReturnValue(
+ const RecordStorageLocation &RecordLoc, const CallExpr *CE,
+ Environment &Env) {
+ QualType Type = CE->getType();
+ assert(!Type.isNull());
+ assert(!Type->isReferenceType());
+ assert(!Type->isRecordType());
+
+ auto &ObjMap = ConstMethodReturnValues[&RecordLoc];
+ const FunctionDecl *DirectCallee = CE->getDirectCallee();
+ if (DirectCallee == nullptr)
+ return nullptr;
+ auto it = ObjMap.find(DirectCallee);
+ if (it != ObjMap.end())
+ return it->second;
+
+ Value *Val = Env.createValue(Type);
+ if (Val != nullptr)
+ ObjMap.insert({DirectCallee, Val});
+ return Val;
+}
+
+template <typename Base>
+StorageLocation *
+CachedConstAccessorsLattice<Base>::getOrCreateConstMethodReturnStorageLocation(
+ const RecordStorageLocation &RecordLoc, const CallExpr *CE,
+ Environment &Env, llvm::function_ref<void(StorageLocation &)> Initialize) {
+ assert(!CE->getType().isNull());
+ assert(CE->isGLValue() || CE->getType()->isRecordType());
+ auto &ObjMap = ConstMethodReturnStorageLocations[&RecordLoc];
+ const FunctionDecl *DirectCallee = CE->getDirectCallee();
+ if (DirectCallee == nullptr)
+ return nullptr;
+ auto it = ObjMap.find(DirectCallee);
+ if (it != ObjMap.end())
+ return it->second;
+
+ StorageLocation &Loc =
+ Env.createStorageLocation(CE->getType().getNonReferenceType());
+ Initialize(Loc);
+
+ ObjMap.insert({DirectCallee, &Loc});
+ return &Loc;
+}
+
+} // namespace dataflow
+} // namespace clang
+
+#endif // LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_CACHED_CONST_ACCESSORS_LATTICE_H
diff --git a/clang/include/clang/CodeGen/CodeGenABITypes.h b/clang/include/clang/CodeGen/CodeGenABITypes.h
index 9cbc5a8..836fdd7 100644
--- a/clang/include/clang/CodeGen/CodeGenABITypes.h
+++ b/clang/include/clang/CodeGen/CodeGenABITypes.h
@@ -75,11 +75,25 @@ const CGFunctionInfo &arrangeCXXMethodType(CodeGenModule &CGM,
const FunctionProtoType *FTP,
const CXXMethodDecl *MD);
-const CGFunctionInfo &arrangeFreeFunctionCall(CodeGenModule &CGM,
- CanQualType returnType,
- ArrayRef<CanQualType> argTypes,
- FunctionType::ExtInfo info,
- RequiredArgs args);
+const CGFunctionInfo &
+arrangeCXXMethodCall(CodeGenModule &CGM, CanQualType returnType,
+ ArrayRef<CanQualType> argTypes, FunctionType::ExtInfo info,
+ ArrayRef<FunctionProtoType::ExtParameterInfo> paramInfos,
+ RequiredArgs args);
+
+const CGFunctionInfo &arrangeFreeFunctionCall(
+ CodeGenModule &CGM, CanQualType returnType, ArrayRef<CanQualType> argTypes,
+ FunctionType::ExtInfo info,
+ ArrayRef<FunctionProtoType::ExtParameterInfo> paramInfos,
+ RequiredArgs args);
+
+// An overload with an empty `paramInfos`.
+inline const CGFunctionInfo &
+arrangeFreeFunctionCall(CodeGenModule &CGM, CanQualType returnType,
+ ArrayRef<CanQualType> argTypes,
+ FunctionType::ExtInfo info, RequiredArgs args) {
+ return arrangeFreeFunctionCall(CGM, returnType, argTypes, info, {}, args);
+}
/// Returns the implicit arguments to add to a complete, non-delegating C++
/// constructor call.
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 2072ae4..379e75b 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -8531,7 +8531,7 @@ def _SLASH_execution_charset : CLCompileJoined<"execution-charset:">,
HelpText<"Set runtime encoding, supports only UTF-8">,
Alias<fexec_charset_EQ>;
def _SLASH_std : CLCompileJoined<"std:">,
- HelpText<"Set language version (c++14,c++17,c++20,c++latest,c11,c17)">;
+ HelpText<"Set language version (c++14,c++17,c++20,c++23preview,c++latest,c11,c17)">;
def _SLASH_U : CLJoinedOrSeparate<"U">, HelpText<"Undefine macro">,
MetaVarName<"<macro>">, Alias<U>;
def _SLASH_validate_charset : CLFlag<"validate-charset">,
diff --git a/clang/include/clang/ExtractAPI/API.h b/clang/include/clang/ExtractAPI/API.h
index 4f34fcc..c30e6fa 100644
--- a/clang/include/clang/ExtractAPI/API.h
+++ b/clang/include/clang/ExtractAPI/API.h
@@ -26,6 +26,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/TargetParser/Triple.h"
#include <cstddef>
#include <iterator>
@@ -615,7 +616,24 @@ struct TagRecord : APIRecord, RecordContext {
return classofKind(Record->getKind());
}
static bool classofKind(RecordKind K) {
- return K == RK_Struct || K == RK_Union || K == RK_Enum;
+ switch (K) {
+ case RK_Enum:
+ LLVM_FALLTHROUGH;
+ case RK_Struct:
+ LLVM_FALLTHROUGH;
+ case RK_Union:
+ LLVM_FALLTHROUGH;
+ case RK_CXXClass:
+ LLVM_FALLTHROUGH;
+ case RK_ClassTemplate:
+ LLVM_FALLTHROUGH;
+ case RK_ClassTemplateSpecialization:
+ LLVM_FALLTHROUGH;
+ case RK_ClassTemplatePartialSpecialization:
+ return true;
+ default:
+ return false;
+ }
}
bool IsEmbeddedInVarDeclarator;
@@ -684,7 +702,22 @@ struct RecordRecord : TagRecord {
return classofKind(Record->getKind());
}
static bool classofKind(RecordKind K) {
- return K == RK_Struct || K == RK_Union;
+ switch (K) {
+ case RK_Struct:
+ LLVM_FALLTHROUGH;
+ case RK_Union:
+ LLVM_FALLTHROUGH;
+ case RK_CXXClass:
+ LLVM_FALLTHROUGH;
+ case RK_ClassTemplate:
+ LLVM_FALLTHROUGH;
+ case RK_ClassTemplateSpecialization:
+ LLVM_FALLTHROUGH;
+ case RK_ClassTemplatePartialSpecialization:
+ return true;
+ default:
+ return false;
+ }
}
bool isAnonymousWithNoTypedef() { return Name.empty(); }
diff --git a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def
index 737bc8e..ad2dbff 100644
--- a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def
+++ b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def
@@ -299,13 +299,12 @@ ANALYZER_OPTION(
ANALYZER_OPTION(
bool, ShouldEagerlyAssume, "eagerly-assume",
- "Whether we should eagerly assume evaluations of conditionals, thus, "
- "bifurcating the path. This indicates how the engine should handle "
- "expressions such as: 'x = (y != 0)'. When this is true then the "
- "subexpression 'y != 0' will be eagerly assumed to be true or false, thus "
- "evaluating it to the integers 0 or 1 respectively. The upside is that "
- "this can increase analysis precision until we have a better way to lazily "
- "evaluate such logic. The downside is that it eagerly bifurcates paths.",
+ "If this is enabled (the default behavior), when the analyzer encounters "
+ "a comparison operator or logical negation, it immediately splits the "
+ "state to separate the case when the expression is true and the case when "
+ "it's false. The upside is that this can increase analysis precision until "
+ "we have a better way to lazily evaluate such logic; the downside is that "
+ "it eagerly bifurcates paths.",
true)
ANALYZER_OPTION(
diff --git a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h
index 3a3c1a1..2f4cd27 100644
--- a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h
+++ b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h
@@ -229,8 +229,6 @@ public:
unsigned AnalyzerDisplayProgress : 1;
unsigned AnalyzerNoteAnalysisEntryPoints : 1;
- unsigned eagerlyAssumeBinOpBifurcation : 1;
-
unsigned TrimGraph : 1;
unsigned visualizeExplodedGraphWithGraphViz : 1;
unsigned UnoptimizedCFG : 1;
@@ -293,9 +291,9 @@ public:
ShowConfigOptionsList(false),
ShouldEmitErrorsOnInvalidConfigValue(false), AnalyzeAll(false),
AnalyzerDisplayProgress(false), AnalyzerNoteAnalysisEntryPoints(false),
- eagerlyAssumeBinOpBifurcation(false), TrimGraph(false),
- visualizeExplodedGraphWithGraphViz(false), UnoptimizedCFG(false),
- PrintStats(false), NoRetryExhausted(false), AnalyzerWerror(false) {}
+ TrimGraph(false), visualizeExplodedGraphWithGraphViz(false),
+ UnoptimizedCFG(false), PrintStats(false), NoRetryExhausted(false),
+ AnalyzerWerror(false) {}
/// Interprets an option's string value as a boolean. The "true" string is
/// interpreted as true and the "false" string is interpreted as false.
diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h
index 04eacd1..8c7493e 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h
@@ -583,14 +583,13 @@ public:
ExplodedNode *Pred,
ExplodedNodeSet &Dst);
- /// evalEagerlyAssumeBinOpBifurcation - Given the nodes in 'Src', eagerly assume symbolic
- /// expressions of the form 'x != 0' and generate new nodes (stored in Dst)
- /// with those assumptions.
- void evalEagerlyAssumeBinOpBifurcation(ExplodedNodeSet &Dst, ExplodedNodeSet &Src,
- const Expr *Ex);
+ /// evalEagerlyAssumeBifurcation - Given the nodes in 'Src', eagerly assume
+ /// concrete boolean values for 'Ex', storing the resulting nodes in 'Dst'.
+ void evalEagerlyAssumeBifurcation(ExplodedNodeSet &Dst, ExplodedNodeSet &Src,
+ const Expr *Ex);
static std::pair<const ProgramPointTag *, const ProgramPointTag *>
- geteagerlyAssumeBinOpBifurcationTags();
+ getEagerlyAssumeBifurcationTags();
ProgramStateRef handleLValueBitCast(ProgramStateRef state, const Expr *Ex,
const LocationContext *LCtx, QualType T,
diff --git a/clang/lib/AST/DeclTemplate.cpp b/clang/lib/AST/DeclTemplate.cpp
index d9b67b7..d2d8907 100644
--- a/clang/lib/AST/DeclTemplate.cpp
+++ b/clang/lib/AST/DeclTemplate.cpp
@@ -1185,6 +1185,20 @@ SourceRange ClassTemplatePartialSpecializationDecl::getSourceRange() const {
return Range;
}
+ArrayRef<TemplateArgument>
+ClassTemplatePartialSpecializationDecl::getInjectedTemplateArgs() {
+ TemplateParameterList *Params = getTemplateParameters();
+ auto *First = cast<ClassTemplatePartialSpecializationDecl>(getFirstDecl());
+ if (!First->InjectedArgs) {
+ auto &Context = getASTContext();
+ SmallVector<TemplateArgument, 16> TemplateArgs;
+ Context.getInjectedTemplateArgs(Params, TemplateArgs);
+ First->InjectedArgs = new (Context) TemplateArgument[TemplateArgs.size()];
+ std::copy(TemplateArgs.begin(), TemplateArgs.end(), First->InjectedArgs);
+ }
+ return llvm::ArrayRef(First->InjectedArgs, Params->size());
+}
+
//===----------------------------------------------------------------------===//
// FriendTemplateDecl Implementation
//===----------------------------------------------------------------------===//
@@ -1535,6 +1549,20 @@ SourceRange VarTemplatePartialSpecializationDecl::getSourceRange() const {
return Range;
}
+ArrayRef<TemplateArgument>
+VarTemplatePartialSpecializationDecl::getInjectedTemplateArgs() {
+ TemplateParameterList *Params = getTemplateParameters();
+ auto *First = cast<VarTemplatePartialSpecializationDecl>(getFirstDecl());
+ if (!First->InjectedArgs) {
+ auto &Context = getASTContext();
+ SmallVector<TemplateArgument, 16> TemplateArgs;
+ Context.getInjectedTemplateArgs(Params, TemplateArgs);
+ First->InjectedArgs = new (Context) TemplateArgument[TemplateArgs.size()];
+ std::copy(TemplateArgs.begin(), TemplateArgs.end(), First->InjectedArgs);
+ }
+ return llvm::ArrayRef(First->InjectedArgs, Params->size());
+}
+
static TemplateParameterList *
createMakeIntegerSeqParameterList(const ASTContext &C, DeclContext *DC) {
// typename T
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 52a7f57..8544052 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -7237,6 +7237,7 @@ class APValueToBufferConverter {
case APValue::ComplexInt:
case APValue::ComplexFloat:
+ return visitComplex(Val, Ty, Offset);
case APValue::FixedPoint:
// FIXME: We should support these.
@@ -7323,6 +7324,31 @@ class APValueToBufferConverter {
return true;
}
+ bool visitComplex(const APValue &Val, QualType Ty, CharUnits Offset) {
+ const ComplexType *ComplexTy = Ty->castAs<ComplexType>();
+ QualType EltTy = ComplexTy->getElementType();
+ CharUnits EltSizeChars = Info.Ctx.getTypeSizeInChars(EltTy);
+ bool IsInt = Val.isComplexInt();
+
+ if (IsInt) {
+ if (!visitInt(Val.getComplexIntReal(), EltTy,
+ Offset + (0 * EltSizeChars)))
+ return false;
+ if (!visitInt(Val.getComplexIntImag(), EltTy,
+ Offset + (1 * EltSizeChars)))
+ return false;
+ } else {
+ if (!visitFloat(Val.getComplexFloatReal(), EltTy,
+ Offset + (0 * EltSizeChars)))
+ return false;
+ if (!visitFloat(Val.getComplexFloatImag(), EltTy,
+ Offset + (1 * EltSizeChars)))
+ return false;
+ }
+
+ return true;
+ }
+
bool visitVector(const APValue &Val, QualType Ty, CharUnits Offset) {
const VectorType *VTy = Ty->castAs<VectorType>();
QualType EltTy = VTy->getElementType();
@@ -7595,6 +7621,23 @@ class BufferToAPValueConverter {
return ArrayValue;
}
+ std::optional<APValue> visit(const ComplexType *Ty, CharUnits Offset) {
+ QualType ElementType = Ty->getElementType();
+ CharUnits ElementWidth = Info.Ctx.getTypeSizeInChars(ElementType);
+ bool IsInt = ElementType->isIntegerType();
+
+ std::optional<APValue> Values[2];
+ for (unsigned I = 0; I != 2; ++I) {
+ Values[I] = visitType(Ty->getElementType(), Offset + I * ElementWidth);
+ if (!Values[I])
+ return std::nullopt;
+ }
+
+ if (IsInt)
+ return APValue(Values[0]->getInt(), Values[1]->getInt());
+ return APValue(Values[0]->getFloat(), Values[1]->getFloat());
+ }
+
std::optional<APValue> visit(const VectorType *VTy, CharUnits Offset) {
QualType EltTy = VTy->getElementType();
unsigned NElts = VTy->getNumElements();
diff --git a/clang/lib/Basic/Targets/OSTargets.cpp b/clang/lib/Basic/Targets/OSTargets.cpp
index b56e2c7..88c0541 100644
--- a/clang/lib/Basic/Targets/OSTargets.cpp
+++ b/clang/lib/Basic/Targets/OSTargets.cpp
@@ -214,9 +214,11 @@ static void addVisualCDefines(const LangOptions &Opts, MacroBuilder &Builder) {
Builder.defineMacro("_HAS_CHAR16_T_LANGUAGE_SUPPORT", Twine(1));
if (Opts.isCompatibleWithMSVC(LangOptions::MSVC2015)) {
- if (Opts.CPlusPlus23)
+ if (Opts.CPlusPlus26)
// TODO update to the proper value.
- Builder.defineMacro("_MSVC_LANG", "202004L");
+ Builder.defineMacro("_MSVC_LANG", "202400L");
+ else if (Opts.CPlusPlus23)
+ Builder.defineMacro("_MSVC_LANG", "202302L");
else if (Opts.CPlusPlus20)
Builder.defineMacro("_MSVC_LANG", "202002L");
else if (Opts.CPlusPlus17)
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 12f99d9..f6d7db2 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -5657,13 +5657,14 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
*Arg3 = EmitScalarExpr(E->getArg(3));
llvm::FunctionType *FTy = llvm::FunctionType::get(
Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
+ Value *ACast = Builder.CreateAddrSpaceCast(Arg3, I8PTy);
// We know the third argument is an integer type, but we may need to cast
// it to i32.
if (Arg2->getType() != Int32Ty)
Arg2 = Builder.CreateZExtOrTrunc(Arg2, Int32Ty);
return RValue::get(
EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name),
- {Arg0, Arg1, Arg2, Arg3, PacketSize, PacketAlign}));
+ {Arg0, Arg1, Arg2, ACast, PacketSize, PacketAlign}));
}
}
// OpenCL v2.0 s6.13.16 ,s9.17.3.5 - Built-in pipe reserve read and write
diff --git a/clang/lib/CodeGen/CodeGenABITypes.cpp b/clang/lib/CodeGen/CodeGenABITypes.cpp
index a6073e1..3f10d68 100644
--- a/clang/lib/CodeGen/CodeGenABITypes.cpp
+++ b/clang/lib/CodeGen/CodeGenABITypes.cpp
@@ -59,14 +59,23 @@ CodeGen::arrangeCXXMethodType(CodeGenModule &CGM,
return CGM.getTypes().arrangeCXXMethodType(RD, FTP, MD);
}
-const CGFunctionInfo &
-CodeGen::arrangeFreeFunctionCall(CodeGenModule &CGM,
- CanQualType returnType,
- ArrayRef<CanQualType> argTypes,
- FunctionType::ExtInfo info,
- RequiredArgs args) {
- return CGM.getTypes().arrangeLLVMFunctionInfo(returnType, FnInfoOpts::None,
- argTypes, info, {}, args);
+const CGFunctionInfo &CodeGen::arrangeCXXMethodCall(
+ CodeGenModule &CGM, CanQualType returnType, ArrayRef<CanQualType> argTypes,
+ FunctionType::ExtInfo info,
+ ArrayRef<FunctionProtoType::ExtParameterInfo> paramInfos,
+ RequiredArgs args) {
+ return CGM.getTypes().arrangeLLVMFunctionInfo(
+ returnType, FnInfoOpts::IsInstanceMethod, argTypes, info, paramInfos,
+ args);
+}
+
+const CGFunctionInfo &CodeGen::arrangeFreeFunctionCall(
+ CodeGenModule &CGM, CanQualType returnType, ArrayRef<CanQualType> argTypes,
+ FunctionType::ExtInfo info,
+ ArrayRef<FunctionProtoType::ExtParameterInfo> paramInfos,
+ RequiredArgs args) {
+ return CGM.getTypes().arrangeLLVMFunctionInfo(
+ returnType, FnInfoOpts::None, argTypes, info, paramInfos, args);
}
ImplicitCXXConstructorArgs
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index b05ab36..b3e805a 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -295,6 +295,7 @@ createTargetCodeGenInfo(CodeGenModule &CGM) {
return createCommonSPIRTargetCodeGenInfo(CGM);
case llvm::Triple::spirv32:
case llvm::Triple::spirv64:
+ case llvm::Triple::spirv:
return createSPIRVTargetCodeGenInfo(CGM);
case llvm::Triple::dxil:
return createDirectXTargetCodeGenInfo(CGM);
diff --git a/clang/lib/CodeGen/Targets/DirectX.cpp b/clang/lib/CodeGen/Targets/DirectX.cpp
index 303a430..7935f7a 100644
--- a/clang/lib/CodeGen/Targets/DirectX.cpp
+++ b/clang/lib/CodeGen/Targets/DirectX.cpp
@@ -63,6 +63,7 @@ llvm::Type *DirectXTargetCodeGenInfo::getHLSLType(CodeGenModule &CGM,
llvm_unreachable("dx.Sampler handles are not implemented yet");
break;
}
+ llvm_unreachable("Unknown llvm::dxil::ResourceClass enum");
}
} // namespace
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index c132fa3..3fc3929 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7225,6 +7225,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
.Case("c++17", "-std=c++17")
.Case("c++20", "-std=c++20")
// TODO add c++23 and c++26 when MSVC supports it.
+ .Case("c++23preview", "-std=c++23")
.Case("c++latest", "-std=c++26")
.Default("");
if (LanguageStandard.empty())
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 3dd86ab..e662c3f 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -1294,6 +1294,16 @@ void tools::addFortranRuntimeLibs(const ToolChain &TC, const ArgList &Args,
CmdArgs.push_back("-lFortranRuntime");
CmdArgs.push_back("-lFortranDecimal");
}
+
+ // libomp needs libatomic for atomic operations if using libgcc
+ if (Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
+ options::OPT_fno_openmp, false)) {
+ Driver::OpenMPRuntimeKind OMPRuntime =
+ TC.getDriver().getOpenMPRuntime(Args);
+ ToolChain::RuntimeLibType RuntimeLib = TC.GetRuntimeLibType(Args);
+ if (OMPRuntime == Driver::OMPRT_OMP && RuntimeLib == ToolChain::RLT_Libgcc)
+ CmdArgs.push_back("-latomic");
+ }
}
void tools::addFortranRuntimeLibraryPath(const ToolChain &TC,
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index 75d82c1..38f808a 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -18273,7 +18273,7 @@ bool Sema::CheckOverridingFunctionReturnType(const CXXMethodDecl *New,
}
// The return types aren't either both pointers or references to a class type.
- if (NewClassTy.isNull()) {
+ if (NewClassTy.isNull() || !NewClassTy->isStructureOrClassType()) {
Diag(New->getLocation(),
diag::err_different_return_type_for_overriding_virtual_function)
<< New->getDeclName() << NewTy << OldTy
diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp
index 22aedbc..d33b0d0 100644
--- a/clang/lib/Sema/SemaOpenACC.cpp
+++ b/clang/lib/Sema/SemaOpenACC.cpp
@@ -2216,7 +2216,7 @@ ExprResult SemaOpenACC::CheckGangExpr(OpenACCGangKind GK, Expr *E) {
case OpenACCGangKind::Static:
return CheckGangStaticExpr(*this, E);
}
- }
+ } break;
default:
llvm_unreachable("Non compute construct in active compute construct?");
}
diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp
index 8c7f694..8665c09 100644
--- a/clang/lib/Sema/SemaTemplateInstantiate.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp
@@ -237,7 +237,7 @@ struct TemplateInstantiationArgumentCollecter
if (Innermost)
AddInnermostTemplateArguments(VTPSD);
else if (ForConstraintInstantiation)
- AddOuterTemplateArguments(VTPSD, VTPSD->getTemplateArgs().asArray(),
+ AddOuterTemplateArguments(VTPSD, VTPSD->getInjectedTemplateArgs(),
/*Final=*/false);
if (VTPSD->isMemberSpecialization())
@@ -274,7 +274,7 @@ struct TemplateInstantiationArgumentCollecter
if (Innermost)
AddInnermostTemplateArguments(CTPSD);
else if (ForConstraintInstantiation)
- AddOuterTemplateArguments(CTPSD, CTPSD->getTemplateArgs().asArray(),
+ AddOuterTemplateArguments(CTPSD, CTPSD->getInjectedTemplateArgs(),
/*Final=*/false);
if (CTPSD->isMemberSpecialization())
diff --git a/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp b/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp
index 68c8a8d..c4479db 100644
--- a/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp
+++ b/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp
@@ -2695,7 +2695,7 @@ ConditionBRVisitor::VisitNodeImpl(const ExplodedNode *N,
PathSensitiveBugReport &BR) {
ProgramPoint ProgPoint = N->getLocation();
const std::pair<const ProgramPointTag *, const ProgramPointTag *> &Tags =
- ExprEngine::geteagerlyAssumeBinOpBifurcationTags();
+ ExprEngine::getEagerlyAssumeBifurcationTags();
// If an assumption was made on a branch, it should be caught
// here by looking at the state transition.
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
index 43ab646..0e400df 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
@@ -2129,7 +2129,7 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred,
(B->isRelationalOp() || B->isEqualityOp())) {
ExplodedNodeSet Tmp;
VisitBinaryOperator(cast<BinaryOperator>(S), Pred, Tmp);
- evalEagerlyAssumeBinOpBifurcation(Dst, Tmp, cast<Expr>(S));
+ evalEagerlyAssumeBifurcation(Dst, Tmp, cast<Expr>(S));
}
else
VisitBinaryOperator(cast<BinaryOperator>(S), Pred, Dst);
@@ -2402,7 +2402,7 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred,
if (AMgr.options.ShouldEagerlyAssume && (U->getOpcode() == UO_LNot)) {
ExplodedNodeSet Tmp;
VisitUnaryOperator(U, Pred, Tmp);
- evalEagerlyAssumeBinOpBifurcation(Dst, Tmp, U);
+ evalEagerlyAssumeBifurcation(Dst, Tmp, U);
}
else
VisitUnaryOperator(U, Pred, Dst);
@@ -3742,23 +3742,20 @@ void ExprEngine::evalLocation(ExplodedNodeSet &Dst,
BldrTop.addNodes(Tmp);
}
-std::pair<const ProgramPointTag *, const ProgramPointTag*>
-ExprEngine::geteagerlyAssumeBinOpBifurcationTags() {
- static SimpleProgramPointTag
- eagerlyAssumeBinOpBifurcationTrue(TagProviderName,
- "Eagerly Assume True"),
- eagerlyAssumeBinOpBifurcationFalse(TagProviderName,
- "Eagerly Assume False");
- return std::make_pair(&eagerlyAssumeBinOpBifurcationTrue,
- &eagerlyAssumeBinOpBifurcationFalse);
+std::pair<const ProgramPointTag *, const ProgramPointTag *>
+ExprEngine::getEagerlyAssumeBifurcationTags() {
+ static SimpleProgramPointTag TrueTag(TagProviderName, "Eagerly Assume True"),
+ FalseTag(TagProviderName, "Eagerly Assume False");
+
+ return std::make_pair(&TrueTag, &FalseTag);
}
-void ExprEngine::evalEagerlyAssumeBinOpBifurcation(ExplodedNodeSet &Dst,
- ExplodedNodeSet &Src,
- const Expr *Ex) {
+void ExprEngine::evalEagerlyAssumeBifurcation(ExplodedNodeSet &Dst,
+ ExplodedNodeSet &Src,
+ const Expr *Ex) {
StmtNodeBuilder Bldr(Src, Dst, *currBldrCtx);
- for (const auto Pred : Src) {
+ for (ExplodedNode *Pred : Src) {
    // Test if the previous node was at the same expression. This can happen
// when the expression fails to evaluate to anything meaningful and
// (as an optimization) we don't generate a node.
@@ -3767,28 +3764,26 @@ void ExprEngine::evalEagerlyAssumeBinOpBifurcation(ExplodedNodeSet &Dst,
continue;
}
- ProgramStateRef state = Pred->getState();
- SVal V = state->getSVal(Ex, Pred->getLocationContext());
+ ProgramStateRef State = Pred->getState();
+ SVal V = State->getSVal(Ex, Pred->getLocationContext());
std::optional<nonloc::SymbolVal> SEV = V.getAs<nonloc::SymbolVal>();
if (SEV && SEV->isExpression()) {
- const std::pair<const ProgramPointTag *, const ProgramPointTag*> &tags =
- geteagerlyAssumeBinOpBifurcationTags();
+ const auto &[TrueTag, FalseTag] = getEagerlyAssumeBifurcationTags();
- ProgramStateRef StateTrue, StateFalse;
- std::tie(StateTrue, StateFalse) = state->assume(*SEV);
+ auto [StateTrue, StateFalse] = State->assume(*SEV);
// First assume that the condition is true.
if (StateTrue) {
SVal Val = svalBuilder.makeIntVal(1U, Ex->getType());
StateTrue = StateTrue->BindExpr(Ex, Pred->getLocationContext(), Val);
- Bldr.generateNode(Ex, Pred, StateTrue, tags.first);
+ Bldr.generateNode(Ex, Pred, StateTrue, TrueTag);
}
// Next, assume that the condition is false.
if (StateFalse) {
SVal Val = svalBuilder.makeIntVal(0U, Ex->getType());
StateFalse = StateFalse->BindExpr(Ex, Pred->getLocationContext(), Val);
- Bldr.generateNode(Ex, Pred, StateFalse, tags.second);
+ Bldr.generateNode(Ex, Pred, StateFalse, FalseTag);
}
}
}
diff --git a/clang/test/Analysis/string.c b/clang/test/Analysis/string.c
index 79b4877..2e0a49d 100644
--- a/clang/test/Analysis/string.c
+++ b/clang/test/Analysis/string.c
@@ -361,6 +361,10 @@ void strcpy_fn_const(char *x) {
strcpy(x, (const char*)&strcpy_fn); // expected-warning{{Argument to string copy function is the address of the function 'strcpy_fn', which is not a null-terminated string}}
}
+void strcpy_fn_dst(const char *x) {
+ strcpy((char*)&strcpy_fn, x); // expected-warning{{Argument to string copy function is the address of the function 'strcpy_fn', which is not a null-terminated string}}
+}
+
extern int globalInt;
void strcpy_effects(char *x, char *y) {
char a = x[0];
@@ -469,8 +473,22 @@ void strcat_null_src(char *x) {
strcat(x, NULL); // expected-warning{{Null pointer passed as 2nd argument to string concatenation function}}
}
-void strcat_fn(char *x) {
- strcat(x, (char*)&strcat_fn); // expected-warning{{Argument to string concatenation function is the address of the function 'strcat_fn', which is not a null-terminated string}}
+void strcat_fn_dst(const char *x) {
+ strcat((char*)&strcat_fn_dst, x); // expected-warning{{Argument to string concatenation function is the address of the function 'strcat_fn_dst', which is not a null-terminated string}}
+}
+
+void strcat_fn_src(char *x) {
+ strcat(x, (char*)&strcat_fn_src); // expected-warning{{Argument to string concatenation function is the address of the function 'strcat_fn_src', which is not a null-terminated string}}
+}
+
+void strcat_label_dst(const char *x) {
+label:
+ strcat((char*)&&label, x); // expected-warning{{Argument to string concatenation function is the address of the label 'label', which is not a null-terminated string}}
+}
+
+void strcat_label_src(char *x) {
+label:
+ strcat(x, (char*)&&label); // expected-warning{{Argument to string concatenation function is the address of the label 'label', which is not a null-terminated string}}
}
void strcat_effects(char *y) {
@@ -568,8 +586,12 @@ void strncpy_null_src(char *x) {
strncpy(x, NULL, 5); // expected-warning{{Null pointer passed as 2nd argument to string copy function}}
}
-void strncpy_fn(char *x) {
- strncpy(x, (char*)&strcpy_fn, 5); // expected-warning{{Argument to string copy function is the address of the function 'strcpy_fn', which is not a null-terminated string}}
+void strncpy_fn_src(char *x) {
+ strncpy(x, (char*)&strncpy_fn_src, 5); // expected-warning{{Argument to string copy function is the address of the function 'strncpy_fn_src', which is not a null-terminated string}}
+}
+
+void strncpy_fn_dst(const char *x) {
+ strncpy((char*)&strncpy_fn_dst, x, 5); // expected-warning{{Argument to string copy function is the address of the function 'strncpy_fn_dst', which is not a null-terminated string}}
}
void strncpy_effects(char *x, char *y) {
@@ -680,8 +702,12 @@ void strncat_null_src(char *x) {
strncat(x, NULL, 4); // expected-warning{{Null pointer passed as 2nd argument to string concatenation function}}
}
-void strncat_fn(char *x) {
- strncat(x, (char*)&strncat_fn, 4); // expected-warning{{Argument to string concatenation function is the address of the function 'strncat_fn', which is not a null-terminated string}}
+void strncat_fn_src(char *x) {
+ strncat(x, (char*)&strncat_fn_src, 4); // expected-warning{{Argument to string concatenation function is the address of the function 'strncat_fn_src', which is not a null-terminated string}}
+}
+
+void strncat_fn_dst(const char *x) {
+ strncat((char*)&strncat_fn_dst, x, 4); // expected-warning{{Argument to string concatenation function is the address of the function 'strncat_fn_dst', which is not a null-terminated string}}
}
void strncat_effects(char *y) {
@@ -921,6 +947,14 @@ int strcmp_null_argument(char *a) {
return strcmp(a, b); // expected-warning{{Null pointer passed as 2nd argument to string comparison function}}
}
+void strcmp_fn_r(char *x) {
+ strcmp(x, (char*)&strcmp_null_argument); // expected-warning{{Argument to string comparison function is the address of the function 'strcmp_null_argument', which is not a null-terminated string}}
+}
+
+void strcmp_fn_l(char *x) {
+ strcmp((char*)&strcmp_null_argument, x); // expected-warning{{Argument to string comparison function is the address of the function 'strcmp_null_argument', which is not a null-terminated string}}
+}
+
//===----------------------------------------------------------------------===
// strncmp()
//===----------------------------------------------------------------------===
diff --git a/clang/test/Analysis/string.cpp b/clang/test/Analysis/string.cpp
index 1be6c21..c09422d 100644
--- a/clang/test/Analysis/string.cpp
+++ b/clang/test/Analysis/string.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix,debug.ExprInspection -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix,alpha.unix.cstring,debug.ExprInspection -verify %s
// Test functions that are called "memcpy" but aren't the memcpy
// we're looking for. Unfortunately, this test cannot be put into
@@ -6,6 +6,7 @@
// as a normal C function for the test to make sense.
typedef __typeof(sizeof(int)) size_t;
void *memcpy(void *, const void *, size_t);
+size_t strlen(const char *s);
int sprintf(char *str, const char *format, ...);
int snprintf(char *str, size_t size, const char *format, ...);
@@ -45,3 +46,10 @@ void log(const char* fmt, const Args&... args) {
void test_gh_74269_no_crash() {
log("%d", 1);
}
+
+struct TestNotNullTerm {
+ void test1() {
+ TestNotNullTerm * const &x = this;
+ strlen((char *)&x); // expected-warning{{Argument to string length function is not a null-terminated string}}
+ }
+};
diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.decl/p4.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.decl/p4.cpp
index 70064f8..f144e14 100644
--- a/clang/test/CXX/temp/temp.constr/temp.constr.decl/p4.cpp
+++ b/clang/test/CXX/temp/temp.constr/temp.constr.decl/p4.cpp
@@ -1,175 +1,219 @@
// RUN: %clang_cc1 -std=c++20 -verify %s
// expected-no-diagnostics
-template<typename T>
-concept D = true;
+namespace Primary {
+ template<typename T>
+ concept D = true;
-template<typename T>
-struct A {
- template<typename U, bool V>
- void f() requires V;
+ template<typename T>
+ struct A {
+ template<typename U, bool V>
+ void f() requires V;
- template<>
- void f<short, true>();
+ template<>
+ void f<short, true>();
+
+ template<D U>
+ void g();
+
+ template<typename U, bool V> requires V
+ struct B;
+
+ template<typename U, bool V> requires V
+ struct B<U*, V>;
+
+ template<>
+ struct B<short, true>;
+
+ template<D U>
+ struct C;
+
+ template<D U>
+ struct C<U*>;
+ template<typename U, bool V> requires V
+ static int x;
+
+ template<typename U, bool V> requires V
+ static int x<U*, V>;
+
+ template<>
+ int x<short, true>;
+
+ template<D U>
+ static int y;
+
+ template<D U>
+ static int y<U*>;
+ };
+
+ template<typename T>
+ template<typename U, bool V>
+ void A<T>::f() requires V { }
+
+ template<typename T>
template<D U>
- void g();
+ void A<T>::g() { }
+ template<typename T>
template<typename U, bool V> requires V
- struct B;
+ struct A<T>::B { };
+ template<typename T>
template<typename U, bool V> requires V
- struct B<U*, V>;
+ struct A<T>::B<U*, V> { };
- template<>
- struct B<short, true>;
+ template<typename T>
+ template<typename U, bool V> requires V
+ struct A<T>::B<U&, V> { };
+ template<typename T>
template<D U>
- struct C;
+ struct A<T>::C { };
+ template<typename T>
template<D U>
- struct C<U*>;
+ struct A<T>::C<U*> { };
+ template<typename T>
template<typename U, bool V> requires V
- static int x;
+ int A<T>::x = 0;
+ template<typename T>
template<typename U, bool V> requires V
- static int x<U*, V>;
+ int A<T>::x<U*, V> = 0;
- template<>
- int x<short, true>;
+ template<typename T>
+ template<typename U, bool V> requires V
+ int A<T>::x<U&, V> = 0;
+ template<typename T>
template<D U>
- static int y;
+ int A<T>::y = 0;
+ template<typename T>
template<D U>
- static int y<U*>;
-};
-
-template<typename T>
-template<typename U, bool V>
-void A<T>::f() requires V { }
+ int A<T>::y<U*> = 0;
-template<typename T>
-template<D U>
-void A<T>::g() { }
-
-template<typename T>
-template<typename U, bool V> requires V
-struct A<T>::B { };
+ template<>
+ template<typename U, bool V>
+ void A<short>::f() requires V;
-template<typename T>
-template<typename U, bool V> requires V
-struct A<T>::B<U*, V> { };
+ template<>
+ template<>
+ void A<short>::f<int, true>();
-template<typename T>
-template<typename U, bool V> requires V
-struct A<T>::B<U&, V> { };
+ template<>
+ template<>
+ void A<void>::f<int, true>();
-template<typename T>
-template<D U>
-struct A<T>::C { };
+ template<>
+ template<D U>
+ void A<short>::g();
-template<typename T>
-template<D U>
-struct A<T>::C<U*> { };
+ template<>
+ template<typename U, bool V> requires V
+ struct A<int>::B;
-template<typename T>
-template<typename U, bool V> requires V
-int A<T>::x = 0;
+ template<>
+ template<>
+ struct A<int>::B<int, true>;
-template<typename T>
-template<typename U, bool V> requires V
-int A<T>::x<U*, V> = 0;
+ template<>
+ template<>
+ struct A<void>::B<int, true>;
-template<typename T>
-template<typename U, bool V> requires V
-int A<T>::x<U&, V> = 0;
+ template<>
+ template<typename U, bool V> requires V
+ struct A<int>::B<U*, V>;
-template<typename T>
-template<D U>
-int A<T>::y = 0;
+ template<>
+ template<typename U, bool V> requires V
+ struct A<int>::B<U&, V>;
-template<typename T>
-template<D U>
-int A<T>::y<U*> = 0;
+ template<>
+ template<D U>
+ struct A<int>::C;
-template<>
-template<typename U, bool V>
-void A<short>::f() requires V;
+ template<>
+ template<D U>
+ struct A<int>::C<U*>;
-template<>
-template<>
-void A<short>::f<int, true>();
+ template<>
+ template<D U>
+ struct A<int>::C<U&>;
-template<>
-template<>
-void A<void>::f<int, true>();
+ template<>
+ template<typename U, bool V> requires V
+ int A<long>::x;
-template<>
-template<D U>
-void A<short>::g();
+ template<>
+ template<>
+ int A<long>::x<int, true>;
-template<>
-template<typename U, bool V> requires V
-struct A<int>::B;
+ template<>
+ template<>
+ int A<void>::x<int, true>;
-template<>
-template<>
-struct A<int>::B<int, true>;
+ template<>
+ template<typename U, bool V> requires V
+ int A<long>::x<U*, V>;
-template<>
-template<>
-struct A<void>::B<int, true>;
+ template<>
+ template<typename U, bool V> requires V
+ int A<long>::x<U&, V>;
-template<>
-template<typename U, bool V> requires V
-struct A<int>::B<U*, V>;
+ template<>
+ template<D U>
+ int A<long>::y;
-template<>
-template<typename U, bool V> requires V
-struct A<int>::B<U&, V>;
+ template<>
+ template<D U>
+ int A<long>::y<U*>;
-template<>
-template<D U>
-struct A<int>::C;
+ template<>
+ template<D U>
+ int A<long>::y<U&>;
+} // namespace Primary
-template<>
-template<D U>
-struct A<int>::C<U*>;
+namespace Partial {
+ template<typename T, bool B>
+ struct A;
-template<>
-template<D U>
-struct A<int>::C<U&>;
+ template<bool U>
+ struct A<int, U>
+ {
+ template<typename V> requires U
+ void f();
-template<>
-template<typename U, bool V> requires V
-int A<long>::x;
+ template<typename V> requires U
+ static const int x;
-template<>
-template<>
-int A<long>::x<int, true>;
+ template<typename V> requires U
+ struct B;
+ };
-template<>
-template<>
-int A<void>::x<int, true>;
+ template<bool U>
+ template<typename V> requires U
+ void A<int, U>::f() { }
-template<>
-template<typename U, bool V> requires V
-int A<long>::x<U*, V>;
+ template<bool U>
+ template<typename V> requires U
+ constexpr int A<int, U>::x = 0;
-template<>
-template<typename U, bool V> requires V
-int A<long>::x<U&, V>;
+ template<bool U>
+ template<typename V> requires U
+ struct A<int, U>::B { };
-template<>
-template<D U>
-int A<long>::y;
+ template<>
+ template<typename V> requires true
+ void A<int, true>::f() { }
-template<>
-template<D U>
-int A<long>::y<U*>;
+ template<>
+ template<typename V> requires true
+ constexpr int A<int, true>::x = 1;
-template<>
-template<D U>
-int A<long>::y<U&>;
+ template<>
+ template<typename V> requires true
+ struct A<int, true>::B { };
+} // namespace Partial
diff --git a/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl b/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl
index 03e149d..093a199 100644
--- a/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl
@@ -10,27 +10,27 @@
// CHECK-LABEL: test_int
int test_int(int expr, uint idx) {
// CHECK-SPIRV: %[[#entry_tok0:]] = call token @llvm.experimental.convergence.entry()
- // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.readlane.i32([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok0]]) ]
+ // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.readlane.i32([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok0]]) ]
// CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.readlane.i32([[TY]] %[[#]], i32 %[[#]])
// CHECK: ret [[TY]] %[[RET]]
return WaveReadLaneAt(expr, idx);
}
// CHECK-DXIL: declare [[TY]] @llvm.dx.wave.readlane.i32([[TY]], i32) #[[#attr:]]
-// CHECK-SPIRV: declare [[TY]] @llvm.spv.wave.readlane.i32([[TY]], i32) #[[#attr:]]
+// CHECK-SPIRV: declare spir_func [[TY]] @llvm.spv.wave.readlane.i32([[TY]], i32) #[[#attr:]]
#ifdef __HLSL_ENABLE_16_BIT
// CHECK-LABEL: test_int16
int16_t test_int16(int16_t expr, uint idx) {
// CHECK-SPIRV: %[[#entry_tok1:]] = call token @llvm.experimental.convergence.entry()
- // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.readlane.i16([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok1]]) ]
+ // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.readlane.i16([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok1]]) ]
// CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.readlane.i16([[TY]] %[[#]], i32 %[[#]])
// CHECK: ret [[TY]] %[[RET]]
return WaveReadLaneAt(expr, idx);
}
// CHECK-DXIL: declare [[TY]] @llvm.dx.wave.readlane.i16([[TY]], i32) #[[#attr:]]
-// CHECK-SPIRV: declare [[TY]] @llvm.spv.wave.readlane.i16([[TY]], i32) #[[#attr:]]
+// CHECK-SPIRV: declare spir_func [[TY]] @llvm.spv.wave.readlane.i16([[TY]], i32) #[[#attr:]]
#endif
// Test basic lowering to runtime function call with array and float values.
@@ -38,37 +38,37 @@ int16_t test_int16(int16_t expr, uint idx) {
// CHECK-LABEL: test_half
half test_half(half expr, uint idx) {
// CHECK-SPIRV: %[[#entry_tok2:]] = call token @llvm.experimental.convergence.entry()
- // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.readlane.f16([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok2]]) ]
+ // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.readlane.f16([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok2]]) ]
// CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.readlane.f16([[TY]] %[[#]], i32 %[[#]])
// CHECK: ret [[TY]] %[[RET]]
return WaveReadLaneAt(expr, idx);
}
// CHECK-DXIL: declare [[TY]] @llvm.dx.wave.readlane.f16([[TY]], i32) #[[#attr:]]
-// CHECK-SPIRV: declare [[TY]] @llvm.spv.wave.readlane.f16([[TY]], i32) #[[#attr:]]
+// CHECK-SPIRV: declare spir_func [[TY]] @llvm.spv.wave.readlane.f16([[TY]], i32) #[[#attr:]]
// CHECK-LABEL: test_double
double test_double(double expr, uint idx) {
// CHECK-SPIRV: %[[#entry_tok3:]] = call token @llvm.experimental.convergence.entry()
- // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.readlane.f64([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok3]]) ]
+ // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.readlane.f64([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok3]]) ]
// CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.readlane.f64([[TY]] %[[#]], i32 %[[#]])
// CHECK: ret [[TY]] %[[RET]]
return WaveReadLaneAt(expr, idx);
}
// CHECK-DXIL: declare [[TY]] @llvm.dx.wave.readlane.f64([[TY]], i32) #[[#attr:]]
-// CHECK-SPIRV: declare [[TY]] @llvm.spv.wave.readlane.f64([[TY]], i32) #[[#attr:]]
+// CHECK-SPIRV: declare spir_func [[TY]] @llvm.spv.wave.readlane.f64([[TY]], i32) #[[#attr:]]
// CHECK-LABEL: test_floatv4
float4 test_floatv4(float4 expr, uint idx) {
// CHECK-SPIRV: %[[#entry_tok4:]] = call token @llvm.experimental.convergence.entry()
- // CHECK-SPIRV: %[[RET1:.*]] = call [[TY1:.*]] @llvm.spv.wave.readlane.v4f32([[TY1]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok4]]) ]
+ // CHECK-SPIRV: %[[RET1:.*]] = call spir_func [[TY1:.*]] @llvm.spv.wave.readlane.v4f32([[TY1]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok4]]) ]
// CHECK-DXIL: %[[RET1:.*]] = call [[TY1:.*]] @llvm.dx.wave.readlane.v4f32([[TY1]] %[[#]], i32 %[[#]])
// CHECK: ret [[TY1]] %[[RET1]]
return WaveReadLaneAt(expr, idx);
}
// CHECK-DXIL: declare [[TY1]] @llvm.dx.wave.readlane.v4f32([[TY1]], i32) #[[#attr]]
-// CHECK-SPIRV: declare [[TY1]] @llvm.spv.wave.readlane.v4f32([[TY1]], i32) #[[#attr]]
+// CHECK-SPIRV: declare spir_func [[TY1]] @llvm.spv.wave.readlane.v4f32([[TY1]], i32) #[[#attr]]
// CHECK: attributes #[[#attr]] = {{{.*}} convergent {{.*}}}
diff --git a/clang/test/CodeGenHLSL/builtins/sign.hlsl b/clang/test/CodeGenHLSL/builtins/sign.hlsl
index 0ed9a94..1cdefa8 100644
--- a/clang/test/CodeGenHLSL/builtins/sign.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/sign.hlsl
@@ -202,19 +202,19 @@ int4 test_sign_int64_t4(int64_t4 p0) { return sign(p0); }
// CHECK: define [[FNATTRS]] i32 @
// CHECK: [[CMP:%.*]] = icmp eq i64 [[ARG:%.*]], 0
// CHECK: %hlsl.sign = select i1 [[CMP]], i32 0, i32 1
-int test_sign_int64_t(uint64_t p0) { return sign(p0); }
+int test_sign_uint64_t(uint64_t p0) { return sign(p0); }
// CHECK: define [[FNATTRS]] <2 x i32> @
// CHECK: [[CMP:%.*]] = icmp eq <2 x i64> [[ARG:%.*]], zeroinitializer
// CHECK: %hlsl.sign = select <2 x i1> [[CMP]], <2 x i32> zeroinitializer, <2 x i32> <i32 1, i32 1>
-int2 test_sign_int64_t2(uint64_t2 p0) { return sign(p0); }
+int2 test_sign_uint64_t2(uint64_t2 p0) { return sign(p0); }
// CHECK: define [[FNATTRS]] <3 x i32> @
// CHECK: [[CMP:%.*]] = icmp eq <3 x i64> [[ARG:%.*]], zeroinitializer
// CHECK: %hlsl.sign = select <3 x i1> [[CMP]], <3 x i32> zeroinitializer, <3 x i32> <i32 1, i32 1, i32 1>
-int3 test_sign_int64_t3(uint64_t3 p0) { return sign(p0); }
+int3 test_sign_uint64_t3(uint64_t3 p0) { return sign(p0); }
// CHECK: define [[FNATTRS]] <4 x i32> @
// CHECK: [[CMP:%.*]] = icmp eq <4 x i64> [[ARG:%.*]], zeroinitializer
// CHECK: %hlsl.sign = select <4 x i1> [[CMP]], <4 x i32> zeroinitializer, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-int4 test_sign_int64_t4(uint64_t4 p0) { return sign(p0); }
+int4 test_sign_uint64_t4(uint64_t4 p0) { return sign(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_do_while.hlsl b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_do_while.hlsl
index 6b053dc..3ab8048 100644
--- a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_do_while.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_do_while.hlsl
@@ -17,7 +17,7 @@ void main() {
// CHECK: br i1 {{%.+}}, label %[[LABEL_IF_THEN:.+]], label %[[LABEL_IF_END:.+]]
// CHECK: [[LABEL_IF_THEN]]:
-// CHECK: call i32 @__hlsl_wave_get_lane_index() [ "convergencectrl"(token %[[CT_LOOP]]) ]
+// CHECK: call spir_func i32 @__hlsl_wave_get_lane_index() [ "convergencectrl"(token %[[CT_LOOP]]) ]
// CHECK: br label %[[LABEL_WHILE_END:.+]]
if (cond == 2) {
uint index = WaveGetLaneIndex();
@@ -33,7 +33,7 @@ void main() {
// CHECK: ret void
}
-// CHECK-DAG: declare i32 @__hlsl_wave_get_lane_index() [[A1:#[0-9]+]]
+// CHECK-DAG: declare spir_func i32 @__hlsl_wave_get_lane_index() [[A1:#[0-9]+]]
// CHECK-DAG: attributes [[A0]] = {{{.*}}convergent{{.*}}}
// CHECK-DAG: attributes [[A1]] = {{{.*}}convergent{{.*}}}
diff --git a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_simple.hlsl b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_simple.hlsl
index 06a2715..8e1f2d6 100644
--- a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_simple.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_simple.hlsl
@@ -9,13 +9,13 @@
// CHECK-SPIRV: define spir_func noundef i32 @{{.*test_1.*}}() [[A0:#[0-9]+]] {
// CHECK-DXIL: define noundef i32 @{{.*test_1.*}}() [[A0:#[0-9]+]] {
// CHECK-SPIRV: %[[CI:[0-9]+]] = call token @llvm.experimental.convergence.entry()
-// CHECK-SPIRV: call i32 @__hlsl_wave_get_lane_index() [ "convergencectrl"(token %[[CI]]) ]
+// CHECK-SPIRV: call spir_func i32 @__hlsl_wave_get_lane_index() [ "convergencectrl"(token %[[CI]]) ]
// CHECK-DXIL: call i32 @llvm.dx.wave.getlaneindex()
int test_1() {
return WaveGetLaneIndex();
}
-// CHECK-SPIRV: declare i32 @__hlsl_wave_get_lane_index() [[A1:#[0-9]+]]
+// CHECK-SPIRV: declare spir_func i32 @__hlsl_wave_get_lane_index() [[A1:#[0-9]+]]
// CHECK-DXIL: declare i32 @llvm.dx.wave.getlaneindex() [[A1:#[0-9]+]]
// CHECK-DAG: attributes [[A0]] = { {{.*}}convergent{{.*}} }
diff --git a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_subcall.hlsl b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_subcall.hlsl
index 6ea80d6..12b120d 100644
--- a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_subcall.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_subcall.hlsl
@@ -3,12 +3,12 @@
// CHECK: define spir_func noundef i32 @_Z6test_1v() [[A0:#[0-9]+]] {
// CHECK: %[[C1:[0-9]+]] = call token @llvm.experimental.convergence.entry()
-// CHECK: call i32 @__hlsl_wave_get_lane_index() [ "convergencectrl"(token %[[C1]]) ]
+// CHECK: call spir_func i32 @__hlsl_wave_get_lane_index() [ "convergencectrl"(token %[[C1]]) ]
uint test_1() {
return WaveGetLaneIndex();
}
-// CHECK-DAG: declare i32 @__hlsl_wave_get_lane_index() [[A1:#[0-9]+]]
+// CHECK-DAG: declare spir_func i32 @__hlsl_wave_get_lane_index() [[A1:#[0-9]+]]
// CHECK: define spir_func noundef i32 @_Z6test_2v() [[A0]] {
// CHECK: %[[C2:[0-9]+]] = call token @llvm.experimental.convergence.entry()
diff --git a/clang/test/CodeGenHLSL/builtins/wave_is_first_lane.hlsl b/clang/test/CodeGenHLSL/builtins/wave_is_first_lane.hlsl
index 18860c3..2fb6def 100644
--- a/clang/test/CodeGenHLSL/builtins/wave_is_first_lane.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/wave_is_first_lane.hlsl
@@ -13,7 +13,7 @@ void main() {
while (true) {
// CHECK-DXIL: %[[#]] = call i1 @llvm.dx.wave.is.first.lane()
-// CHECK-SPIRV: %[[#]] = call i1 @llvm.spv.wave.is.first.lane()
+// CHECK-SPIRV: %[[#]] = call spir_func i1 @llvm.spv.wave.is.first.lane()
// CHECK-SPIRV-SAME: [ "convergencectrl"(token %[[#loop_tok]]) ]
if (WaveIsFirstLane()) {
break;
@@ -21,7 +21,7 @@ void main() {
}
// CHECK-DXIL: %[[#]] = call i1 @llvm.dx.wave.is.first.lane()
-// CHECK-SPIRV: %[[#]] = call i1 @llvm.spv.wave.is.first.lane()
+// CHECK-SPIRV: %[[#]] = call spir_func i1 @llvm.spv.wave.is.first.lane()
// CHECK-SPIRV-SAME: [ "convergencectrl"(token %[[#entry_tok]]) ]
if (WaveIsFirstLane()) {
return;
diff --git a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
index bab0e21..7377b5b 100644
--- a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
+++ b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
@@ -1,9 +1,10 @@
-// RUN: %clang_cc1 %s -emit-llvm -o - -O0 -ffake-address-space-map -triple i686-pc-darwin | FileCheck -enable-var-scope -check-prefixes=ALL,X86 %s
-// RUN: %clang_cc1 %s -emit-llvm -o - -O0 -triple amdgcn | FileCheck -enable-var-scope -check-prefixes=ALL,AMDGCN %s
-// RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL2.0 -O0 -triple amdgcn | FileCheck -enable-var-scope -check-prefixes=ALL,AMDGCN,AMDGCN20 %s
-// RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL1.2 -O0 -triple spir-unknown-unknown-unknown | FileCheck -enable-var-scope -check-prefixes=SPIR %s
-// RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL3.0 -O0 -triple amdgcn -cl-ext=+__opencl_c_program_scope_global_variables | FileCheck -enable-var-scope -check-prefixes=ALL,AMDGCN,AMDGCN20 %s
-// RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL3.0 -O0 -triple amdgcn | FileCheck -enable-var-scope -check-prefixes=ALL,AMDGCN %s
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 %s -emit-llvm -o - -O0 -ffake-address-space-map -triple i686-pc-darwin | FileCheck -check-prefixes=X86 %s
+// RUN: %clang_cc1 %s -emit-llvm -o - -O0 -triple amdgcn | FileCheck -check-prefixes=AMDGCN %s
+// RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL2.0 -O0 -triple amdgcn | FileCheck -check-prefixes=AMDGCN20 %s
+// RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL1.2 -O0 -triple spir-unknown-unknown-unknown | FileCheck -check-prefixes=SPIR %s
+// RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL3.0 -O0 -triple amdgcn -cl-ext=+__opencl_c_program_scope_global_variables | FileCheck -check-prefixes=AMDGCN30-GVAR %s
+// RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL3.0 -O0 -triple amdgcn | FileCheck -check-prefixes=AMDGCN30 %s
typedef int int2 __attribute__((ext_vector_type(2)));
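
The RUN-line rewrite above switches this test to fully autogenerated assertions: each configuration now gets its own closed prefix set (AMDGCN20, AMDGCN30-GVAR, AMDGCN30 instead of the layered ALL/AMDGCN prefixes), since update_cc_test_checks.py emits a complete function body per prefix. The regenerated checks that follow therefore spell out the whole per-target ABI lowering of struct arguments. Distilled to the signatures involved, as a sketch; @foo.x86 and @foo.amdgcn are invented names, while the attributes are the ones the checks use:

; Same source function, two ABIs: x86 returns the aggregate through an sret
; pointer and takes the argument byval; amdgcn coerces the small argument
; into a [9 x i32] register array and returns the aggregate by value.
%struct.Mat3X3 = type { [9 x i32] }
%struct.Mat4X4 = type { [16 x i32] }

declare void @foo.x86(ptr dead_on_unwind noalias writable sret(%struct.Mat4X4) align 4,
                      ptr noundef byval(%struct.Mat3X3) align 4)

declare %struct.Mat4X4 @foo.amdgcn([9 x i32])
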
@@ -45,147 +46,1236 @@ struct LargeStructTwoMember {
struct LargeStructOneMember g_s;
#endif
-// X86-LABEL: define{{.*}} void @foo(ptr dead_on_unwind noalias writable sret(%struct.Mat4X4) align 4 %agg.result, ptr noundef byval(%struct.Mat3X3) align 4 %in)
-// AMDGCN-LABEL: define{{.*}} %struct.Mat4X4 @foo([9 x i32] %in.coerce)
+//
+// X86-LABEL: define void @foo(
+// X86-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT4X4:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_MAT3X3:%.*]]) align 4 [[IN:%.*]]) #[[ATTR0:[0-9]+]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT: store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local %struct.Mat4X4 @foo(
+// AMDGCN-SAME: [9 x i32] [[IN_COERCE:%.*]]) #[[ATTR0:[0-9]+]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5)
+// AMDGCN-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5)
+// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(5) [[IN]], i32 0, i32 0
+// AMDGCN-NEXT: store [9 x i32] [[IN_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 4
+// AMDGCN-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr addrspace(5) [[RETVAL]], align 4
+// AMDGCN-NEXT: ret [[STRUCT_MAT4X4]] [[TMP0]]
+//
+// AMDGCN20-LABEL: define dso_local %struct.Mat4X4 @foo(
+// AMDGCN20-SAME: [9 x i32] [[IN_COERCE:%.*]]) #[[ATTR0:[0-9]+]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5)
+// AMDGCN20-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5)
+// AMDGCN20-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(5) [[IN]], i32 0, i32 0
+// AMDGCN20-NEXT: store [9 x i32] [[IN_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 4
+// AMDGCN20-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr addrspace(5) [[RETVAL]], align 4
+// AMDGCN20-NEXT: ret [[STRUCT_MAT4X4]] [[TMP0]]
+//
+// SPIR-LABEL: define dso_local spir_func void @foo(
+// SPIR-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT4X4:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_MAT3X3:%.*]]) align 4 [[IN:%.*]]) #[[ATTR0:[0-9]+]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local %struct.Mat4X4 @foo(
+// AMDGCN30-GVAR-SAME: [9 x i32] [[IN_COERCE:%.*]]) #[[ATTR0:[0-9]+]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(5) [[IN]], i32 0, i32 0
+// AMDGCN30-GVAR-NEXT: store [9 x i32] [[IN_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 4
+// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr addrspace(5) [[RETVAL]], align 4
+// AMDGCN30-GVAR-NEXT: ret [[STRUCT_MAT4X4]] [[TMP0]]
+//
+// AMDGCN30-LABEL: define dso_local %struct.Mat4X4 @foo(
+// AMDGCN30-SAME: [9 x i32] [[IN_COERCE:%.*]]) #[[ATTR0:[0-9]+]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5)
+// AMDGCN30-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5)
+// AMDGCN30-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(5) [[IN]], i32 0, i32 0
+// AMDGCN30-NEXT: store [9 x i32] [[IN_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 4
+// AMDGCN30-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr addrspace(5) [[RETVAL]], align 4
+// AMDGCN30-NEXT: ret [[STRUCT_MAT4X4]] [[TMP0]]
+//
Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) {
Mat4X4 out;
return out;
}
-// ALL-LABEL: define {{.*}} void @ker
-// Expect two mem copies: one for the argument "in", and one for
-// the return value.
-// X86: call void @llvm.memcpy.p0.p1.i32(ptr
-// X86: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1)
-
-// AMDGCN: load [9 x i32], ptr addrspace(1)
-// AMDGCN: call %struct.Mat4X4 @foo([9 x i32]
-// AMDGCN: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1)
+//
+// X86-LABEL: define spir_kernel void @ker(
+// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// X86-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4
+// X86-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4
+// X86-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4
+// X86-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4
+// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4
+// X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i32 0
+// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4
+// X86-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3]], ptr addrspace(1) [[TMP1]], i32 1
+// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i32 36, i1 false)
+// X86-NEXT: call void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3:[0-9]+]]
+// X86-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 64, i1 false)
+// X86-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @ker(
+// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5)
+// AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0
+// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1
+// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0
+// AMDGCN-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4
+// AMDGCN-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]]
+// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0
+// AMDGCN-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0
+// AMDGCN-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4
+// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false)
+// AMDGCN-NEXT: ret void
+//
+// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @ker(
+// AMDGCN20-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN20-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN20-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5)
+// AMDGCN20-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN20-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN20-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0
+// AMDGCN20-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN20-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1
+// AMDGCN20-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0
+// AMDGCN20-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4
+// AMDGCN20-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]]
+// AMDGCN20-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0
+// AMDGCN20-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0
+// AMDGCN20-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4
+// AMDGCN20-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false)
+// AMDGCN20-NEXT: ret void
+//
+// SPIR-LABEL: define dso_local spir_kernel void @ker(
+// SPIR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META5]] !kernel_arg_type_qual [[META6:![0-9]+]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// SPIR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// SPIR-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4
+// SPIR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4
+// SPIR-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4
+// SPIR-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4
+// SPIR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4
+// SPIR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i32 0
+// SPIR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4
+// SPIR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3]], ptr addrspace(1) [[TMP1]], i32 1
+// SPIR-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i32 36, i1 false)
+// SPIR-NEXT: call spir_func void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3:[0-9]+]]
+// SPIR-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 64, i1 false)
+// SPIR-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @ker(
+// AMDGCN30-GVAR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5)
+// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN30-GVAR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0
+// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN30-GVAR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1
+// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0
+// AMDGCN30-GVAR-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4
+// AMDGCN30-GVAR-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]]
+// AMDGCN30-GVAR-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0
+// AMDGCN30-GVAR-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0
+// AMDGCN30-GVAR-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4
+// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false)
+// AMDGCN30-GVAR-NEXT: ret void
+//
+// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @ker(
+// AMDGCN30-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN30-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN30-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5)
+// AMDGCN30-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN30-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN30-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN30-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0
+// AMDGCN30-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN30-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1
+// AMDGCN30-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0
+// AMDGCN30-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4
+// AMDGCN30-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]]
+// AMDGCN30-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0
+// AMDGCN30-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0
+// AMDGCN30-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4
+// AMDGCN30-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false)
+// AMDGCN30-NEXT: ret void
+//
kernel void ker(global Mat3X3 *in, global Mat4X4 *out) {
out[0] = foo(in[1]);
}
-// X86-LABEL: define{{.*}} void @foo_large(ptr dead_on_unwind noalias writable sret(%struct.Mat64X64) align 4 %agg.result, ptr noundef byval(%struct.Mat32X32) align 4 %in)
-// AMDGCN-LABEL: define{{.*}} void @foo_large(ptr addrspace(5) dead_on_unwind noalias writable sret(%struct.Mat64X64) align 4 %agg.result, ptr addrspace(5) noundef byref(%struct.Mat32X32) align 4 %{{.*}}
-// AMDGCN: %in = alloca %struct.Mat32X32, align 4, addrspace(5)
-// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 %in, ptr addrspace(5) align 4 %{{.*}}, i64 4096, i1 false)
+//
+// X86-LABEL: define void @foo_large(
+// X86-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_MAT32X32:%.*]]) align 4 [[IN:%.*]]) #[[ATTR0]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT: store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local void @foo_large(
+// AMDGCN-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5)
+// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false)
+// AMDGCN-NEXT: ret void
+//
+// AMDGCN20-LABEL: define dso_local void @foo_large(
+// AMDGCN20-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5)
+// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false)
+// AMDGCN20-NEXT: ret void
+//
+// SPIR-LABEL: define dso_local spir_func void @foo_large(
+// SPIR-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_MAT32X32:%.*]]) align 4 [[IN:%.*]]) #[[ATTR0]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local void @foo_large(
+// AMDGCN30-GVAR-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5)
+// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false)
+// AMDGCN30-GVAR-NEXT: ret void
+//
+// AMDGCN30-LABEL: define dso_local void @foo_large(
+// AMDGCN30-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5)
+// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false)
+// AMDGCN30-NEXT: ret void
+//
Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) {
Mat64X64 out;
return out;
}
-// ALL-LABEL: define {{.*}} void @ker_large
-// Expect two mem copies: one for the argument "in", and one for
-// the return value.
-// X86: call void @llvm.memcpy.p0.p1.i32(ptr
-// X86: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1)
-// AMDGCN: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5)
-// AMDGCN: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1)
+//
+// X86-LABEL: define spir_kernel void @ker_large(
+// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// X86-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4
+// X86-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4
+// X86-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4
+// X86-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4
+// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4
+// X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i32 0
+// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4
+// X86-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i32 1
+// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i32 4096, i1 false)
+// X86-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// X86-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 16384, i1 false)
+// X86-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @ker_large(
+// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5)
+// AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5)
+// AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i64 0
+// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1
+// AMDGCN-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false)
+// AMDGCN-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false)
+// AMDGCN-NEXT: ret void
+//
+// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @ker_large(
+// AMDGCN20-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN20-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN20-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5)
+// AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5)
+// AMDGCN20-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN20-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN20-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i64 0
+// AMDGCN20-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN20-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1
+// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false)
+// AMDGCN20-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN20-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false)
+// AMDGCN20-NEXT: ret void
+//
+// SPIR-LABEL: define dso_local spir_kernel void @ker_large(
+// SPIR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META7:![0-9]+]] !kernel_arg_base_type [[META7]] !kernel_arg_type_qual [[META6]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// SPIR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// SPIR-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4
+// SPIR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4
+// SPIR-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4
+// SPIR-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4
+// SPIR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4
+// SPIR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i32 0
+// SPIR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4
+// SPIR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i32 1
+// SPIR-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i32 4096, i1 false)
+// SPIR-NEXT: call spir_func void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// SPIR-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 16384, i1 false)
+// SPIR-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @ker_large(
+// AMDGCN30-GVAR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5)
+// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN30-GVAR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i64 0
+// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN30-GVAR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1
+// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false)
+// AMDGCN30-GVAR-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false)
+// AMDGCN30-GVAR-NEXT: ret void
+//
+// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @ker_large(
+// AMDGCN30-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN30-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN30-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5)
+// AMDGCN30-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5)
+// AMDGCN30-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN30-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN30-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN30-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i64 0
+// AMDGCN30-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN30-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1
+// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false)
+// AMDGCN30-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN30-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false)
+// AMDGCN30-NEXT: ret void
+//
kernel void ker_large(global Mat32X32 *in, global Mat64X64 *out) {
out[0] = foo_large(in[1]);
}
-// AMDGCN-LABEL: define{{.*}} void @FuncOneMember(<2 x i32> %u.coerce)
+//
+// X86-LABEL: define void @FuncOneMember(
+// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8
+// X86-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8
+// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 8, i1 false)
+// X86-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// X86-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// X86-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0
+// X86-NEXT: store <2 x i32> [[TMP1]], ptr [[X]], align 8
+// X86-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local void @FuncOneMember(
+// AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR0]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
+// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[X]], align 8
+// AMDGCN-NEXT: ret void
+//
+// AMDGCN20-LABEL: define dso_local void @FuncOneMember(
+// AMDGCN20-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR0]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN20-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
+// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN20-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN20-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[X]], align 8
+// AMDGCN20-NEXT: ret void
+//
+// SPIR-LABEL: define dso_local spir_func void @FuncOneMember(
+// SPIR-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR0]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8
+// SPIR-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// SPIR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// SPIR-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0
+// SPIR-NEXT: store <2 x i32> [[TMP0]], ptr [[X]], align 8
+// SPIR-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local void @FuncOneMember(
+// AMDGCN30-GVAR-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR0]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-GVAR-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
+// AMDGCN30-GVAR-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-GVAR-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[X]], align 8
+// AMDGCN30-GVAR-NEXT: ret void
+//
+// AMDGCN30-LABEL: define dso_local void @FuncOneMember(
+// AMDGCN30-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR0]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN30-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN30-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
+// AMDGCN30-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[X]], align 8
+// AMDGCN30-NEXT: ret void
+//
void FuncOneMember(struct StructOneMember u) {
u.x = (int2)(0, 0);
}
-// AMDGCN-LABEL: define{{.*}} void @FuncOneLargeMember(ptr addrspace(5) noundef byref(%struct.LargeStructOneMember) align 8 %{{.*}}
-// AMDGCN: %u = alloca %struct.LargeStructOneMember, align 8, addrspace(5)
-// AMDGCN: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 %u, ptr addrspace(5) align 8 %{{.*}}, i64 800, i1 false)
-// AMDGCN-NOT: addrspacecast
-// AMDGCN: store <2 x i32> %{{.*}}, ptr addrspace(5)
+//
+// X86-LABEL: define void @FuncOneLargeMember(
+// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8
+// X86-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8
+// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 800, i1 false)
+// X86-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// X86-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// X86-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0
+// X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr [[X]], i32 0, i32 0
+// X86-NEXT: store <2 x i32> [[TMP1]], ptr [[ARRAYIDX]], align 8
+// X86-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local void @FuncOneLargeMember(
+// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false)
+// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr addrspace(5) [[X]], i64 0, i64 0
+// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8
+// AMDGCN-NEXT: ret void
+//
+// AMDGCN20-LABEL: define dso_local void @FuncOneLargeMember(
+// AMDGCN20-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false)
+// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN20-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN20-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr addrspace(5) [[X]], i64 0, i64 0
+// AMDGCN20-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8
+// AMDGCN20-NEXT: ret void
+//
+// SPIR-LABEL: define dso_local spir_func void @FuncOneLargeMember(
+// SPIR-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR0]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8
+// SPIR-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// SPIR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// SPIR-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0
+// SPIR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr [[X]], i32 0, i32 0
+// SPIR-NEXT: store <2 x i32> [[TMP0]], ptr [[ARRAYIDX]], align 8
+// SPIR-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local void @FuncOneLargeMember(
+// AMDGCN30-GVAR-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false)
+// AMDGCN30-GVAR-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-GVAR-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-GVAR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr addrspace(5) [[X]], i64 0, i64 0
+// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8
+// AMDGCN30-GVAR-NEXT: ret void
+//
+// AMDGCN30-LABEL: define dso_local void @FuncOneLargeMember(
+// AMDGCN30-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN30-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false)
+// AMDGCN30-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr addrspace(5) [[X]], i64 0, i64 0
+// AMDGCN30-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8
+// AMDGCN30-NEXT: ret void
+//
void FuncOneLargeMember(struct LargeStructOneMember u) {
u.x[0] = (int2)(0, 0);
}
-// AMDGCN20-LABEL: define{{.*}} void @test_indirect_arg_globl()
-// AMDGCN20: %[[byval_temp:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5)
-// AMDGCN20: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 8 %[[byval_temp]], ptr addrspace(1) align 8 @g_s, i64 800, i1 false)
-// AMDGCN20: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref(%struct.LargeStructOneMember) align 8 %[[byval_temp]])
#if (__OPENCL_C_VERSION__ == 200) || (__OPENCL_C_VERSION__ >= 300 && defined(__opencl_c_program_scope_global_variables))
+// AMDGCN20-LABEL: define dso_local void @test_indirect_arg_globl(
+// AMDGCN20-SAME: ) #[[ATTR0]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(1) align 8 @g_s, i64 800, i1 false)
+// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN20-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local void @test_indirect_arg_globl(
+// AMDGCN30-GVAR-SAME: ) #[[ATTR0]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(1) align 8 @g_s, i64 800, i1 false)
+// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN30-GVAR-NEXT: ret void
+//
void test_indirect_arg_globl(void) {
FuncOneLargeMember(g_s);
}
#endif
-// AMDGCN-LABEL: define{{.*}} amdgpu_kernel void @test_indirect_arg_local()
-// AMDGCN: %[[byval_temp:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5)
-// AMDGCN: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 %[[byval_temp]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false)
-// AMDGCN: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref(%struct.LargeStructOneMember) align 8 %[[byval_temp]])
+//
+// X86-LABEL: define spir_kernel void @test_indirect_arg_local(
+// X86-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 4
+// X86-NEXT: call void @llvm.memcpy.p0.p3.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i32 800, i1 false)
+// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// X86-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local(
+// AMDGCN-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false)
+// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN-NEXT: ret void
+//
+// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local(
+// AMDGCN20-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false)
+// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN20-NEXT: ret void
+//
+// SPIR-LABEL: define dso_local spir_kernel void @test_indirect_arg_local(
+// SPIR-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META8:![0-9]+]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META8]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8
+// SPIR-NEXT: call void @llvm.memcpy.p0.p3.i32(ptr align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i32 800, i1 false)
+// SPIR-NEXT: call spir_func void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]]
+// SPIR-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local(
+// AMDGCN30-GVAR-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false)
+// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN30-GVAR-NEXT: ret void
+//
+// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local(
+// AMDGCN30-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false)
+// AMDGCN30-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN30-NEXT: ret void
+//
kernel void test_indirect_arg_local(void) {
local struct LargeStructOneMember l_s;
FuncOneLargeMember(l_s);
}
-// AMDGCN-LABEL: define{{.*}} void @test_indirect_arg_private()
-// AMDGCN: %[[p_s:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5)
-// AMDGCN-NOT: @llvm.memcpy
-// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref(%struct.LargeStructOneMember) align 8 %[[p_s]])
+//
+// X86-LABEL: define void @test_indirect_arg_private(
+// X86-SAME: ) #[[ATTR0]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8
+// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[P_S]]) #[[ATTR3]]
+// X86-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local void @test_indirect_arg_private(
+// AMDGCN-SAME: ) #[[ATTR0]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR3]]
+// AMDGCN-NEXT: ret void
+//
+// AMDGCN20-LABEL: define dso_local void @test_indirect_arg_private(
+// AMDGCN20-SAME: ) #[[ATTR0]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR3]]
+// AMDGCN20-NEXT: ret void
+//
+// SPIR-LABEL: define dso_local spir_func void @test_indirect_arg_private(
+// SPIR-SAME: ) #[[ATTR0]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8
+// SPIR-NEXT: call spir_func void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR3]]
+// SPIR-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local void @test_indirect_arg_private(
+// AMDGCN30-GVAR-SAME: ) #[[ATTR0]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR3]]
+// AMDGCN30-GVAR-NEXT: ret void
+//
+// AMDGCN30-LABEL: define dso_local void @test_indirect_arg_private(
+// AMDGCN30-SAME: ) #[[ATTR0]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN30-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR3]]
+// AMDGCN30-NEXT: ret void
+//
void test_indirect_arg_private(void) {
struct LargeStructOneMember p_s;
FuncOneLargeMember(p_s);
}
-// AMDGCN-LABEL: define{{.*}} amdgpu_kernel void @KernelOneMember
-// AMDGCN-SAME: (<2 x i32> %[[u_coerce:.*]])
-// AMDGCN: %[[u:.*]] = alloca %struct.StructOneMember, align 8, addrspace(5)
-// AMDGCN: %[[coerce_dive:.*]] = getelementptr inbounds nuw %struct.StructOneMember, ptr addrspace(5) %[[u]], i32 0, i32 0
-// AMDGCN: store <2 x i32> %[[u_coerce]], ptr addrspace(5) %[[coerce_dive]]
-// AMDGCN: call void @FuncOneMember(<2 x i32>
+//
+// X86-LABEL: define spir_kernel void @KernelOneMember(
+// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR3]]
+// X86-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMember(
+// AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
+// AMDGCN-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8
+// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR3]]
+// AMDGCN-NEXT: ret void
+//
+// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelOneMember(
+// AMDGCN20-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN20-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
+// AMDGCN20-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8
+// AMDGCN20-NEXT: call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR3]]
+// AMDGCN20-NEXT: ret void
+//
+// SPIR-LABEL: define dso_local spir_kernel void @KernelOneMember(
+// SPIR-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META10:![0-9]+]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META11]] !kernel_arg_type_qual [[META12:![0-9]+]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: call spir_func void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// SPIR-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelOneMember(
+// AMDGCN30-GVAR-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-GVAR-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
+// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8
+// AMDGCN30-GVAR-NEXT: call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR3]]
+// AMDGCN30-GVAR-NEXT: ret void
+//
+// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelOneMember(
+// AMDGCN30-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN30-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
+// AMDGCN30-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8
+// AMDGCN30-NEXT: call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR3]]
+// AMDGCN30-NEXT: ret void
+//
kernel void KernelOneMember(struct StructOneMember u) {
FuncOneMember(u);
}
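+// Note: the one-member struct is small enough to be coerced to a single
+// <2 x i32> kernel argument on the AMDGCN targets; the SPIR and X86 runs
+// keep the byval pointer convention instead.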
-// SPIR: call void @llvm.memcpy.p0.p1.i32
-// SPIR-NOT: addrspacecast
+//
+// X86-LABEL: define spir_kernel void @KernelOneMemberSpir(
+// X86-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// X86-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 4
+// X86-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR]], align 4
+// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR]], align 4
+// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 8 [[TMP0]], i32 8, i1 false)
+// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// X86-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir(
+// AMDGCN-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN-NEXT: store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8
+// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8
+// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER:%.*]], ptr addrspace(1) [[TMP0]], i32 0, i32 0
+// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[COERCE_DIVE]], align 8
+// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR3]]
+// AMDGCN-NEXT: ret void
+//
+// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir(
+// AMDGCN20-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN20-NEXT: store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8
+// AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8
+// AMDGCN20-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER:%.*]], ptr addrspace(1) [[TMP0]], i32 0, i32 0
+// AMDGCN20-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[COERCE_DIVE]], align 8
+// AMDGCN20-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR3]]
+// AMDGCN20-NEXT: ret void
+//
+// SPIR-LABEL: define dso_local spir_kernel void @KernelOneMemberSpir(
+// SPIR-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META13:![0-9]+]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META14:![0-9]+]] !kernel_arg_base_type [[META14]] !kernel_arg_type_qual [[META12]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// SPIR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8
+// SPIR-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR]], align 4
+// SPIR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR]], align 4
+// SPIR-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 8 [[BYVAL_TEMP]], ptr addrspace(1) align 8 [[TMP0]], i32 8, i1 false)
+// SPIR-NEXT: call spir_func void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]]
+// SPIR-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir(
+// AMDGCN30-GVAR-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8
+// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8
+// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER:%.*]], ptr addrspace(1) [[TMP0]], i32 0, i32 0
+// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[COERCE_DIVE]], align 8
+// AMDGCN30-GVAR-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR3]]
+// AMDGCN30-GVAR-NEXT: ret void
+//
+// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir(
+// AMDGCN30-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN30-NEXT: store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8
+// AMDGCN30-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8
+// AMDGCN30-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER:%.*]], ptr addrspace(1) [[TMP0]], i32 0, i32 0
+// AMDGCN30-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[COERCE_DIVE]], align 8
+// AMDGCN30-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR3]]
+// AMDGCN30-NEXT: ret void
+//
kernel void KernelOneMemberSpir(global struct StructOneMember* u) {
FuncOneMember(*u);
}
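+// Note: with a global pointer argument, the AMDGCN targets load the
+// <2 x i32> member straight from addrspace(1) before the call, while the
+// SPIR and X86 runs first memcpy the struct into a local byval temporary.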
-// AMDGCN-LABEL: define{{.*}} amdgpu_kernel void @KernelLargeOneMember(
-// AMDGCN: %[[U:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5)
-// AMDGCN: %[[U_ELEM:.*]] = getelementptr inbounds nuw %struct.LargeStructOneMember, ptr addrspace(5) %[[U]], i32 0, i32 0
-// AMDGCN: %[[EXTRACT:.*]] = extractvalue %struct.LargeStructOneMember %u.coerce, 0
-// AMDGCN: store [100 x <2 x i32>] %[[EXTRACT]], ptr addrspace(5) %[[U_ELEM]], align 8
-// AMDGCN: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref(%struct.LargeStructOneMember) align 8 %[[U]])
+//
+// X86-LABEL: define spir_kernel void @KernelLargeOneMember(
+// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR3]]
+// X86-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember(
+// AMDGCN-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0
+// AMDGCN-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// AMDGCN-NEXT: ret void
+//
+// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember(
+// AMDGCN20-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0
+// AMDGCN20-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// AMDGCN20-NEXT: ret void
+//
+// SPIR-LABEL: define dso_local spir_kernel void @KernelLargeOneMember(
+// SPIR-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META12]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: call spir_func void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// SPIR-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember(
+// AMDGCN30-GVAR-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0
+// AMDGCN30-GVAR-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// AMDGCN30-GVAR-NEXT: ret void
+//
+// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember(
+// AMDGCN30-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN30-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0
+// AMDGCN30-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN30-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// AMDGCN30-NEXT: ret void
+//
kernel void KernelLargeOneMember(struct LargeStructOneMember u) {
FuncOneLargeMember(u);
}
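+// Note: AMDGCN kernels receive the large struct as a first-class aggregate,
+// spill it to a private alloca via extractvalue/store, and then pass it
+// byref to the helper function.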
-// AMDGCN-LABEL: define{{.*}} void @FuncTwoMember(<2 x i32> %u.coerce0, <2 x i32> %u.coerce1)
+//
+// X86-LABEL: define void @FuncTwoMember(
+// X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8
+// X86-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8
+// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 16, i1 false)
+// X86-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// X86-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// X86-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1
+// X86-NEXT: store <2 x i32> [[TMP1]], ptr [[Y]], align 8
+// X86-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local void @FuncTwoMember(
+// AMDGCN-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR0]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT: store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8
+// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[Y]], align 8
+// AMDGCN-NEXT: ret void
+//
+// AMDGCN20-LABEL: define dso_local void @FuncTwoMember(
+// AMDGCN20-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR0]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN20-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8
+// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN20-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN20-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN20-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[Y]], align 8
+// AMDGCN20-NEXT: ret void
+//
+// SPIR-LABEL: define dso_local spir_func void @FuncTwoMember(
+// SPIR-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR0]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8
+// SPIR-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// SPIR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// SPIR-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1
+// SPIR-NEXT: store <2 x i32> [[TMP0]], ptr [[Y]], align 8
+// SPIR-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local void @FuncTwoMember(
+// AMDGCN30-GVAR-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR0]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-GVAR-NEXT: store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN30-GVAR-NEXT: store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8
+// AMDGCN30-GVAR-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-GVAR-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[Y]], align 8
+// AMDGCN30-GVAR-NEXT: ret void
+//
+// AMDGCN30-LABEL: define dso_local void @FuncTwoMember(
+// AMDGCN30-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR0]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN30-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN30-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-NEXT: store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN30-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN30-NEXT: store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8
+// AMDGCN30-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN30-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[Y]], align 8
+// AMDGCN30-NEXT: ret void
+//
void FuncTwoMember(struct StructTwoMember u) {
u.y = (int2)(0, 0);
}
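+// Note: on the AMDGCN targets the two-member struct is expanded into two
+// separate <2 x i32> parameters (u.coerce0/u.coerce1) rather than being
+// passed indirectly.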
-// AMDGCN-LABEL: define dso_local void @FuncLargeTwoMember
-// AMDGCN-SAME: (ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]])
-// AMDGCN: %[[U:.*]] = alloca %struct.LargeStructTwoMember, align 8, addrspace(5)
-// AMDGCN: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 %[[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false)
+//
+// X86-LABEL: define void @FuncLargeTwoMember(
+// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8
+// X86-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8
+// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 480, i1 false)
+// X86-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// X86-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// X86-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1
+// X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr [[Y]], i32 0, i32 0
+// X86-NEXT: store <2 x i32> [[TMP1]], ptr [[ARRAYIDX]], align 8
+// X86-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local void @FuncLargeTwoMember(
+// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false)
+// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr addrspace(5) [[Y]], i64 0, i64 0
+// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8
+// AMDGCN-NEXT: ret void
+//
+// AMDGCN20-LABEL: define dso_local void @FuncLargeTwoMember(
+// AMDGCN20-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false)
+// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN20-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN20-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN20-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr addrspace(5) [[Y]], i64 0, i64 0
+// AMDGCN20-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8
+// AMDGCN20-NEXT: ret void
+//
+// SPIR-LABEL: define dso_local spir_func void @FuncLargeTwoMember(
+// SPIR-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR0]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8
+// SPIR-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// SPIR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// SPIR-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1
+// SPIR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr [[Y]], i32 0, i32 0
+// SPIR-NEXT: store <2 x i32> [[TMP0]], ptr [[ARRAYIDX]], align 8
+// SPIR-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local void @FuncLargeTwoMember(
+// AMDGCN30-GVAR-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false)
+// AMDGCN30-GVAR-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-GVAR-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN30-GVAR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr addrspace(5) [[Y]], i64 0, i64 0
+// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8
+// AMDGCN30-GVAR-NEXT: ret void
+//
+// AMDGCN30-LABEL: define dso_local void @FuncLargeTwoMember(
+// AMDGCN30-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN30-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false)
+// AMDGCN30-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN30-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr addrspace(5) [[Y]], i64 0, i64 0
+// AMDGCN30-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8
+// AMDGCN30-NEXT: ret void
+//
void FuncLargeTwoMember(struct LargeStructTwoMember u) {
u.y[0] = (int2)(0, 0);
}
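+// Note: on the AMDGCN targets the 480-byte struct arrives byref in
+// addrspace(5) and the callee makes its own private copy with
+// llvm.memcpy.p5.p5.i64 before writing to it; the SPIR run uses the byval
+// pointer directly.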
-// AMDGCN-LABEL: define{{.*}} amdgpu_kernel void @KernelTwoMember
-// AMDGCN-SAME: (%struct.StructTwoMember %[[u_coerce:.*]])
-// AMDGCN: %[[u:.*]] = alloca %struct.StructTwoMember, align 8, addrspace(5)
-// AMDGCN: %[[LD0:.*]] = load <2 x i32>, ptr addrspace(5)
-// AMDGCN: %[[LD1:.*]] = load <2 x i32>, ptr addrspace(5)
-// AMDGCN: call void @FuncTwoMember(<2 x i32> %[[LD0]], <2 x i32> %[[LD1]])
+//
+// X86-LABEL: define spir_kernel void @KernelTwoMember(
+// X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: call void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR3]]
+// X86-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember(
+// AMDGCN-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0
+// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1
+// AMDGCN-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
+// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8
+// AMDGCN-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8
+// AMDGCN-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR3]]
+// AMDGCN-NEXT: ret void
+//
+// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember(
+// AMDGCN20-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0
+// AMDGCN20-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN20-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN20-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1
+// AMDGCN20-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
+// AMDGCN20-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8
+// AMDGCN20-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN20-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8
+// AMDGCN20-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR3]]
+// AMDGCN20-NEXT: ret void
+//
+// SPIR-LABEL: define dso_local spir_kernel void @KernelTwoMember(
+// SPIR-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META12]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: call spir_func void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// SPIR-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember(
+// AMDGCN30-GVAR-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0
+// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN30-GVAR-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1
+// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
+// AMDGCN30-GVAR-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-GVAR-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8
+// AMDGCN30-GVAR-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN30-GVAR-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8
+// AMDGCN30-GVAR-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR3]]
+// AMDGCN30-GVAR-NEXT: ret void
+//
+// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember(
+// AMDGCN30-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN30-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0
+// AMDGCN30-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN30-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN30-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1
+// AMDGCN30-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
+// AMDGCN30-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8
+// AMDGCN30-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN30-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8
+// AMDGCN30-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR3]]
+// AMDGCN30-NEXT: ret void
+//
kernel void KernelTwoMember(struct StructTwoMember u) {
FuncTwoMember(u);
}
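+// Note: the AMDGCN kernels unpack the aggregate argument with extractvalue,
+// store the fields to a private alloca, and reload them to call
+// FuncTwoMember with two <2 x i32> values.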
-// AMDGCN-LABEL: define{{.*}} amdgpu_kernel void @KernelLargeTwoMember
-// AMDGCN-SAME: (%struct.LargeStructTwoMember %[[u_coerce:.*]])
-// AMDGCN: %[[u:.*]] = alloca %struct.LargeStructTwoMember, align 8, addrspace(5)
-// AMDGCN: %[[U_PTR0:.*]] = getelementptr inbounds nuw %struct.LargeStructTwoMember, ptr addrspace(5) %[[u]], i32 0, i32 0
-// AMDGCN: %[[EXTRACT0:.*]] = extractvalue %struct.LargeStructTwoMember %u.coerce, 0
-// AMDGCN: store [40 x <2 x i32>] %[[EXTRACT0]], ptr addrspace(5) %[[U_PTR0]]
-// AMDGCN: %[[U_PTR1:.*]] = getelementptr inbounds nuw %struct.LargeStructTwoMember, ptr addrspace(5) %[[u]], i32 0, i32 1
-// AMDGCN: %[[EXTRACT1:.*]] = extractvalue %struct.LargeStructTwoMember %u.coerce, 1
-// AMDGCN: store [20 x <2 x i32>] %[[EXTRACT1]], ptr addrspace(5) %[[U_PTR1]]
-// AMDGCN: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref(%struct.LargeStructTwoMember) align 8 %[[u]])
+//
+// X86-LABEL: define spir_kernel void @KernelLargeTwoMember(
+// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: call void @FuncLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR3]]
+// X86-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember(
+// AMDGCN-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0
+// AMDGCN-NEXT: store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1
+// AMDGCN-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
+// AMDGCN-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// AMDGCN-NEXT: ret void
+//
+// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember(
+// AMDGCN20-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0
+// AMDGCN20-NEXT: store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN20-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN20-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1
+// AMDGCN20-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
+// AMDGCN20-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// AMDGCN20-NEXT: ret void
+//
+// SPIR-LABEL: define dso_local spir_kernel void @KernelLargeTwoMember(
+// SPIR-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META12]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: call spir_func void @FuncLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// SPIR-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember(
+// AMDGCN30-GVAR-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0
+// AMDGCN30-GVAR-NEXT: store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN30-GVAR-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1
+// AMDGCN30-GVAR-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
+// AMDGCN30-GVAR-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// AMDGCN30-GVAR-NEXT: ret void
+//
+// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember(
+// AMDGCN30-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN30-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0
+// AMDGCN30-NEXT: store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN30-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN30-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1
+// AMDGCN30-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
+// AMDGCN30-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// AMDGCN30-NEXT: ret void
+//
kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
FuncLargeTwoMember(u);
}
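+// Note: same unpacking pattern as KernelTwoMember, but with [40 x <2 x i32>]
+// and [20 x <2 x i32>] array members, so the call itself is lowered to a
+// byref indirect argument instead of per-field vectors.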
+//.
+// X86: [[META4]] = !{i32 1, i32 1}
+// X86: [[META5]] = !{!"none", !"none"}
+// X86: [[META6]] = !{!"Mat3X3*", !"Mat4X4*"}
+// X86: [[META7]] = !{!"", !""}
+// X86: [[META8]] = !{!"Mat32X32*", !"Mat64X64*"}
+// X86: [[META9]] = !{}
+// X86: [[META10]] = !{i32 0}
+// X86: [[META11]] = !{!"none"}
+// X86: [[META12]] = !{!"struct StructOneMember"}
+// X86: [[META13]] = !{!""}
+// X86: [[META14]] = !{i32 1}
+// X86: [[META15]] = !{!"struct StructOneMember*"}
+// X86: [[META16]] = !{!"struct LargeStructOneMember"}
+// X86: [[META17]] = !{!"struct StructTwoMember"}
+// X86: [[META18]] = !{!"struct LargeStructTwoMember"}
+//.
+// AMDGCN: [[META4]] = !{i32 1, i32 1}
+// AMDGCN: [[META5]] = !{!"none", !"none"}
+// AMDGCN: [[META6]] = !{!"Mat3X3*", !"Mat4X4*"}
+// AMDGCN: [[META7]] = !{!"", !""}
+// AMDGCN: [[META8]] = !{!"Mat32X32*", !"Mat64X64*"}
+// AMDGCN: [[META9]] = !{}
+// AMDGCN: [[META10]] = !{i32 0}
+// AMDGCN: [[META11]] = !{!"none"}
+// AMDGCN: [[META12]] = !{!"struct StructOneMember"}
+// AMDGCN: [[META13]] = !{!""}
+// AMDGCN: [[META14]] = !{i32 1}
+// AMDGCN: [[META15]] = !{!"struct StructOneMember*"}
+// AMDGCN: [[META16]] = !{!"struct LargeStructOneMember"}
+// AMDGCN: [[META17]] = !{!"struct StructTwoMember"}
+// AMDGCN: [[META18]] = !{!"struct LargeStructTwoMember"}
+//.
+// AMDGCN20: [[META4]] = !{i32 1, i32 1}
+// AMDGCN20: [[META5]] = !{!"none", !"none"}
+// AMDGCN20: [[META6]] = !{!"Mat3X3*", !"Mat4X4*"}
+// AMDGCN20: [[META7]] = !{!"", !""}
+// AMDGCN20: [[META8]] = !{!"Mat32X32*", !"Mat64X64*"}
+// AMDGCN20: [[META9]] = !{}
+// AMDGCN20: [[META10]] = !{i32 0}
+// AMDGCN20: [[META11]] = !{!"none"}
+// AMDGCN20: [[META12]] = !{!"struct StructOneMember"}
+// AMDGCN20: [[META13]] = !{!""}
+// AMDGCN20: [[META14]] = !{i32 1}
+// AMDGCN20: [[META15]] = !{!"struct StructOneMember*"}
+// AMDGCN20: [[META16]] = !{!"struct LargeStructOneMember"}
+// AMDGCN20: [[META17]] = !{!"struct StructTwoMember"}
+// AMDGCN20: [[META18]] = !{!"struct LargeStructTwoMember"}
+//.
+// SPIR: [[META3]] = !{i32 1, i32 1}
+// SPIR: [[META4]] = !{!"none", !"none"}
+// SPIR: [[META5]] = !{!"Mat3X3*", !"Mat4X4*"}
+// SPIR: [[META6]] = !{!"", !""}
+// SPIR: [[META7]] = !{!"Mat32X32*", !"Mat64X64*"}
+// SPIR: [[META8]] = !{}
+// SPIR: [[META9]] = !{i32 0}
+// SPIR: [[META10]] = !{!"none"}
+// SPIR: [[META11]] = !{!"struct StructOneMember"}
+// SPIR: [[META12]] = !{!""}
+// SPIR: [[META13]] = !{i32 1}
+// SPIR: [[META14]] = !{!"struct StructOneMember*"}
+// SPIR: [[META15]] = !{!"struct LargeStructOneMember"}
+// SPIR: [[META16]] = !{!"struct StructTwoMember"}
+// SPIR: [[META17]] = !{!"struct LargeStructTwoMember"}
+//.
+// AMDGCN30-GVAR: [[META4]] = !{i32 1, i32 1}
+// AMDGCN30-GVAR: [[META5]] = !{!"none", !"none"}
+// AMDGCN30-GVAR: [[META6]] = !{!"Mat3X3*", !"Mat4X4*"}
+// AMDGCN30-GVAR: [[META7]] = !{!"", !""}
+// AMDGCN30-GVAR: [[META8]] = !{!"Mat32X32*", !"Mat64X64*"}
+// AMDGCN30-GVAR: [[META9]] = !{}
+// AMDGCN30-GVAR: [[META10]] = !{i32 0}
+// AMDGCN30-GVAR: [[META11]] = !{!"none"}
+// AMDGCN30-GVAR: [[META12]] = !{!"struct StructOneMember"}
+// AMDGCN30-GVAR: [[META13]] = !{!""}
+// AMDGCN30-GVAR: [[META14]] = !{i32 1}
+// AMDGCN30-GVAR: [[META15]] = !{!"struct StructOneMember*"}
+// AMDGCN30-GVAR: [[META16]] = !{!"struct LargeStructOneMember"}
+// AMDGCN30-GVAR: [[META17]] = !{!"struct StructTwoMember"}
+// AMDGCN30-GVAR: [[META18]] = !{!"struct LargeStructTwoMember"}
+//.
+// AMDGCN30: [[META4]] = !{i32 1, i32 1}
+// AMDGCN30: [[META5]] = !{!"none", !"none"}
+// AMDGCN30: [[META6]] = !{!"Mat3X3*", !"Mat4X4*"}
+// AMDGCN30: [[META7]] = !{!"", !""}
+// AMDGCN30: [[META8]] = !{!"Mat32X32*", !"Mat64X64*"}
+// AMDGCN30: [[META9]] = !{}
+// AMDGCN30: [[META10]] = !{i32 0}
+// AMDGCN30: [[META11]] = !{!"none"}
+// AMDGCN30: [[META12]] = !{!"struct StructOneMember"}
+// AMDGCN30: [[META13]] = !{!""}
+// AMDGCN30: [[META14]] = !{i32 1}
+// AMDGCN30: [[META15]] = !{!"struct StructOneMember*"}
+// AMDGCN30: [[META16]] = !{!"struct LargeStructOneMember"}
+// AMDGCN30: [[META17]] = !{!"struct StructTwoMember"}
+// AMDGCN30: [[META18]] = !{!"struct LargeStructTwoMember"}
+//.
diff --git a/clang/test/CodeGenOpenCL/amdgcn-automatic-variable.cl b/clang/test/CodeGenOpenCL/amdgcn-automatic-variable.cl
index f26495bc..c847f58 100644
--- a/clang/test/CodeGenOpenCL/amdgcn-automatic-variable.cl
+++ b/clang/test/CodeGenOpenCL/amdgcn-automatic-variable.cl
@@ -1,67 +1,107 @@
-// RUN: %clang_cc1 -O0 -cl-std=CL1.2 -triple amdgcn---amdgizcl -emit-llvm %s -o - | FileCheck -check-prefixes=CHECK,CL12 %s
-// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn---amdgizcl -emit-llvm %s -o - | FileCheck -check-prefixes=CHECK,CL20 %s
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -O0 -cl-std=CL1.2 -triple amdgcn-amd-amdhsa -emit-llvm %s -o - | FileCheck -check-prefixes=CL12 %s
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -emit-llvm %s -o - | FileCheck -check-prefixes=CL20 %s
-// CL12-LABEL: define{{.*}} void @func1(ptr addrspace(5) noundef %x)
-// CL20-LABEL: define{{.*}} void @func1(ptr noundef %x)
+// CL12-LABEL: define dso_local void @func1(
+// CL12-SAME: ptr addrspace(5) noundef [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+// CL12-NEXT: [[ENTRY:.*:]]
+// CL12-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// CL12-NEXT: store ptr addrspace(5) [[X]], ptr addrspace(5) [[X_ADDR]], align 4
+// CL12-NEXT: [[TMP0:%.*]] = load ptr addrspace(5), ptr addrspace(5) [[X_ADDR]], align 4
+// CL12-NEXT: store i32 1, ptr addrspace(5) [[TMP0]], align 4
+// CL12-NEXT: ret void
+//
+// CL20-LABEL: define dso_local void @func1(
+// CL20-SAME: ptr noundef [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+// CL20-NEXT: [[ENTRY:.*:]]
+// CL20-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CL20-NEXT: store ptr [[X]], ptr addrspace(5) [[X_ADDR]], align 8
+// CL20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[X_ADDR]], align 8
+// CL20-NEXT: store i32 1, ptr [[TMP0]], align 4
+// CL20-NEXT: ret void
+//
void func1(int *x) {
- // CL12: %[[x_addr:.*]] = alloca ptr addrspace(5){{.*}}addrspace(5)
- // CL12: store ptr addrspace(5) %x, ptr addrspace(5) %[[x_addr]]
- // CL12: %[[r0:.*]] = load ptr addrspace(5), ptr addrspace(5) %[[x_addr]]
- // CL12: store i32 1, ptr addrspace(5) %[[r0]]
- // CL20: %[[x_addr:.*]] = alloca ptr{{.*}}addrspace(5)
- // CL20: store ptr %x, ptr addrspace(5) %[[x_addr]]
- // CL20: %[[r0:.*]] = load ptr, ptr addrspace(5) %[[x_addr]]
- // CL20: store i32 1, ptr %[[r0]]
*x = 1;
}
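+// Note: under CL1.2 an int* parameter is a private pointer
+// (ptr addrspace(5), 32-bit), while under CL2.0 it is a generic pointer
+// (ptr, 64-bit), which is why the allocas, stores, and alignments differ
+// between the two run lines.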
-// CHECK-LABEL: define{{.*}} void @func2()
+// CL12-LABEL: define dso_local void @func2(
+// CL12-SAME: ) #[[ATTR0]] {
+// CL12-NEXT: [[ENTRY:.*:]]
+// CL12-NEXT: [[LV1:%.*]] = alloca i32, align 4, addrspace(5)
+// CL12-NEXT: [[LV2:%.*]] = alloca i32, align 4, addrspace(5)
+// CL12-NEXT: [[LA:%.*]] = alloca [100 x i32], align 4, addrspace(5)
+// CL12-NEXT: [[LP1:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// CL12-NEXT: [[LP2:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// CL12-NEXT: [[LVC:%.*]] = alloca i32, align 4, addrspace(5)
+// CL12-NEXT: store i32 1, ptr addrspace(5) [[LV1]], align 4
+// CL12-NEXT: store i32 2, ptr addrspace(5) [[LV2]], align 4
+// CL12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x i32], ptr addrspace(5) [[LA]], i64 0, i64 0
+// CL12-NEXT: store i32 3, ptr addrspace(5) [[ARRAYIDX]], align 4
+// CL12-NEXT: store ptr addrspace(5) [[LV1]], ptr addrspace(5) [[LP1]], align 4
+// CL12-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [100 x i32], ptr addrspace(5) [[LA]], i64 0, i64 0
+// CL12-NEXT: store ptr addrspace(5) [[ARRAYDECAY]], ptr addrspace(5) [[LP2]], align 4
+// CL12-NEXT: call void @func1(ptr addrspace(5) noundef [[LV1]]) #[[ATTR2:[0-9]+]]
+// CL12-NEXT: store i32 4, ptr addrspace(5) [[LVC]], align 4
+// CL12-NEXT: store i32 4, ptr addrspace(5) [[LV1]], align 4
+// CL12-NEXT: ret void
+//
+// CL20-LABEL: define dso_local void @func2(
+// CL20-SAME: ) #[[ATTR0]] {
+// CL20-NEXT: [[ENTRY:.*:]]
+// CL20-NEXT: [[LV1:%.*]] = alloca i32, align 4, addrspace(5)
+// CL20-NEXT: [[LV2:%.*]] = alloca i32, align 4, addrspace(5)
+// CL20-NEXT: [[LA:%.*]] = alloca [100 x i32], align 4, addrspace(5)
+// CL20-NEXT: [[LP1:%.*]] = alloca ptr, align 8, addrspace(5)
+// CL20-NEXT: [[LP2:%.*]] = alloca ptr, align 8, addrspace(5)
+// CL20-NEXT: [[LVC:%.*]] = alloca i32, align 4, addrspace(5)
+// CL20-NEXT: store i32 1, ptr addrspace(5) [[LV1]], align 4
+// CL20-NEXT: store i32 2, ptr addrspace(5) [[LV2]], align 4
+// CL20-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x i32], ptr addrspace(5) [[LA]], i64 0, i64 0
+// CL20-NEXT: store i32 3, ptr addrspace(5) [[ARRAYIDX]], align 4
+// CL20-NEXT: [[LV1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LV1]] to ptr
+// CL20-NEXT: store ptr [[LV1_ASCAST]], ptr addrspace(5) [[LP1]], align 8
+// CL20-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [100 x i32], ptr addrspace(5) [[LA]], i64 0, i64 0
+// CL20-NEXT: [[ARRAYDECAY_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ARRAYDECAY]] to ptr
+// CL20-NEXT: store ptr [[ARRAYDECAY_ASCAST]], ptr addrspace(5) [[LP2]], align 8
+// CL20-NEXT: [[LV1_ASCAST1:%.*]] = addrspacecast ptr addrspace(5) [[LV1]] to ptr
+// CL20-NEXT: call void @func1(ptr noundef [[LV1_ASCAST1]]) #[[ATTR2:[0-9]+]]
+// CL20-NEXT: store i32 4, ptr addrspace(5) [[LVC]], align 4
+// CL20-NEXT: store i32 4, ptr addrspace(5) [[LV1]], align 4
+// CL20-NEXT: ret void
+//
void func2(void) {
- // CHECK: %lv1 = alloca i32, align 4, addrspace(5)
- // CHECK: %lv2 = alloca i32, align 4, addrspace(5)
- // CHECK: %la = alloca [100 x i32], align 4, addrspace(5)
- // CL12: %lp1 = alloca ptr addrspace(5), align 4, addrspace(5)
- // CL12: %lp2 = alloca ptr addrspace(5), align 4, addrspace(5)
- // CL20: %lp1 = alloca ptr, align 8, addrspace(5)
- // CL20: %lp2 = alloca ptr, align 8, addrspace(5)
- // CHECK: %lvc = alloca i32, align 4, addrspace(5)
-
- // CHECK: store i32 1, ptr addrspace(5) %lv1
int lv1;
lv1 = 1;
- // CHECK: store i32 2, ptr addrspace(5) %lv2
+
int lv2 = 2;
- // CHECK: %[[arrayidx:.*]] = getelementptr inbounds [100 x i32], ptr addrspace(5) %la, i64 0, i64 0
- // CHECK: store i32 3, ptr addrspace(5) %[[arrayidx]], align 4
int la[100];
la[0] = 3;
- // CL12: store ptr addrspace(5) %lv1, ptr addrspace(5) %lp1, align 4
- // CL20: %[[r0:.*]] = addrspacecast ptr addrspace(5) %lv1 to ptr
- // CL20: store ptr %[[r0]], ptr addrspace(5) %lp1, align 8
int *lp1 = &lv1;
- // CHECK: %[[arraydecay:.*]] = getelementptr inbounds [100 x i32], ptr addrspace(5) %la, i64 0, i64 0
- // CL12: store ptr addrspace(5) %[[arraydecay]], ptr addrspace(5) %lp2, align 4
- // CL20: %[[r1:.*]] = addrspacecast ptr addrspace(5) %[[arraydecay]] to ptr
- // CL20: store ptr %[[r1]], ptr addrspace(5) %lp2, align 8
int *lp2 = la;
- // CL12: call void @func1(ptr addrspace(5) noundef %lv1)
- // CL20: %[[r2:.*]] = addrspacecast ptr addrspace(5) %lv1 to ptr
- // CL20: call void @func1(ptr noundef %[[r2]])
func1(&lv1);
- // CHECK: store i32 4, ptr addrspace(5) %lvc
- // CHECK: store i32 4, ptr addrspace(5) %lv1
const int lvc = 4;
lv1 = lvc;
}
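+// Note: automatic variables land in addrspace(5) allocas under both
+// standards; only the CL2.0 run inserts addrspacecast to the generic address
+// space when their addresses are stored into generic pointers or passed to
+// func1.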
-// CHECK-LABEL: define{{.*}} void @func3()
-// CHECK: %a = alloca [16 x [1 x float]], align 4, addrspace(5)
-// CHECK: call void @llvm.memset.p5.i64(ptr addrspace(5) align 4 %a, i8 0, i64 64, i1 false)
+// CL12-LABEL: define dso_local void @func3(
+// CL12-SAME: ) #[[ATTR0]] {
+// CL12-NEXT: [[ENTRY:.*:]]
+// CL12-NEXT: [[A:%.*]] = alloca [16 x [1 x float]], align 4, addrspace(5)
+// CL12-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 4 [[A]], i8 0, i64 64, i1 false)
+// CL12-NEXT: ret void
+//
+// CL20-LABEL: define dso_local void @func3(
+// CL20-SAME: ) #[[ATTR0]] {
+// CL20-NEXT: [[ENTRY:.*:]]
+// CL20-NEXT: [[A:%.*]] = alloca [16 x [1 x float]], align 4, addrspace(5)
+// CL20-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 4 [[A]], i8 0, i64 64, i1 false)
+// CL20-NEXT: ret void
+//
void func3(void) {
float a[16][1] = {{0.}};
}
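+// Note: the zero-initialized 16x1 float array (64 bytes) is lowered to a
+// single llvm.memset.p5.i64 on the private alloca, identically under CL1.2
+// and CL2.0.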
diff --git a/clang/test/CodeGenOpenCL/pipe_builtin.cl b/clang/test/CodeGenOpenCL/pipe_builtin.cl
index c59f63b..ec9d7cb 100644
--- a/clang/test/CodeGenOpenCL/pipe_builtin.cl
+++ b/clang/test/CodeGenOpenCL/pipe_builtin.cl
@@ -1,3 +1,4 @@
+// RUN: %clang_cc1 -triple spir-unknown-unknown -emit-llvm -cl-ext=+cl_khr_subgroups -O0 -cl-std=clc++ -o - %s | FileCheck --check-prefix=CHECK-SPIR %s
// RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm -cl-ext=+cl_khr_subgroups -O0 -cl-std=clc++ -o - %s | FileCheck %s
// FIXME: Add MS ABI manglings of OpenCL things and remove %itanium_abi_triple
// above to support OpenCL in the MS C++ ABI.
@@ -5,65 +6,85 @@
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
void test1(read_only pipe int p, global int *ptr) {
+ // CHECK-SPIR: call spir_func i32 @__read_pipe_2(target("spirv.Pipe", 0) %{{.*}}, ptr addrspace(4) %{{.*}}, i32 4, i32 4)
// CHECK: call i32 @__read_pipe_2(ptr %{{.*}}, ptr %{{.*}}, i32 4, i32 4)
read_pipe(p, ptr);
+ // CHECK-SPIR: call spir_func target("spirv.ReserveId") @__reserve_read_pipe(target("spirv.Pipe", 0) %{{.*}}, i32 {{.*}}, i32 4, i32 4)
// CHECK: call ptr @__reserve_read_pipe(ptr %{{.*}}, i32 {{.*}}, i32 4, i32 4)
reserve_id_t rid = reserve_read_pipe(p, 2);
+ // CHECK-SPIR: call spir_func i32 @__read_pipe_4(target("spirv.Pipe", 0) %{{.*}}, ptr addrspace(4) %{{.*}}, i32 4, i32 4)
// CHECK: call i32 @__read_pipe_4(ptr %{{.*}}, ptr %{{.*}}, i32 {{.*}}, ptr %{{.*}}, i32 4, i32 4)
read_pipe(p, rid, 2, ptr);
+ // CHECK-SPIR: call spir_func void @__commit_read_pipe(target("spirv.Pipe", 0) %{{.*}}, target("spirv.ReserveId") %{{.*}}, i32 4, i32 4)
// CHECK: call void @__commit_read_pipe(ptr %{{.*}}, ptr %{{.*}}, i32 4, i32 4)
commit_read_pipe(p, rid);
}
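+// Note: for the SPIR triple the pipe builtins are lowered to spir_func calls
+// taking target("spirv.Pipe", 0/1) and target("spirv.ReserveId") values,
+// whereas the default lowering passes plain pointers; the remaining tests
+// below follow the same pattern for the write, work-group, and sub-group
+// variants.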
void test2(write_only pipe int p, global int *ptr) {
+ // CHECK-SPIR: call spir_func i32 @__write_pipe_2(target("spirv.Pipe", 1) %{{.*}}, ptr addrspace(4) %{{.*}}, i32 4, i32 4)
// CHECK: call i32 @__write_pipe_2(ptr %{{.*}}, ptr %{{.*}}, i32 4, i32 4)
write_pipe(p, ptr);
+ // CHECK-SPIR: call spir_func target("spirv.ReserveId") @__reserve_write_pipe(target("spirv.Pipe", 1) %{{.*}}, i32 {{.*}}, i32 4, i32 4)
// CHECK: call ptr @__reserve_write_pipe(ptr %{{.*}}, i32 {{.*}}, i32 4, i32 4)
reserve_id_t rid = reserve_write_pipe(p, 2);
+ // CHECK-SPIR: call spir_func i32 @__write_pipe_4(target("spirv.Pipe", 1) %{{.*}}, ptr addrspace(4) %{{.*}}, i32 4, i32 4)
// CHECK: call i32 @__write_pipe_4(ptr %{{.*}}, ptr %{{.*}}, i32 {{.*}}, ptr %{{.*}}, i32 4, i32 4)
write_pipe(p, rid, 2, ptr);
+ // CHECK-SPIR: call spir_func void @__commit_write_pipe(target("spirv.Pipe", 1) %{{.*}}, target("spirv.ReserveId") %{{.*}}, i32 4, i32 4)
// CHECK: call void @__commit_write_pipe(ptr %{{.*}}, ptr %{{.*}}, i32 4, i32 4)
commit_write_pipe(p, rid);
}
void test3(read_only pipe int p, global int *ptr) {
+ // CHECK-SPIR: call spir_func target("spirv.ReserveId") @__work_group_reserve_read_pipe(target("spirv.Pipe", 0) %{{.*}}, i32 {{.*}}, i32 4, i32 4)
// CHECK: call ptr @__work_group_reserve_read_pipe(ptr %{{.*}}, i32 {{.*}}, i32 4, i32 4)
reserve_id_t rid = work_group_reserve_read_pipe(p, 2);
+ // CHECK-SPIR: call spir_func void @__work_group_commit_read_pipe(target("spirv.Pipe", 0) %{{.*}}, target("spirv.ReserveId") %{{.*}}, i32 4, i32 4)
// CHECK: call void @__work_group_commit_read_pipe(ptr %{{.*}}, ptr %{{.*}}, i32 4, i32 4)
work_group_commit_read_pipe(p, rid);
}
void test4(write_only pipe int p, global int *ptr) {
+ // CHECK-SPIR: call spir_func target("spirv.ReserveId") @__work_group_reserve_write_pipe(target("spirv.Pipe", 1) %{{.*}}, i32 {{.*}}, i32 4, i32 4)
// CHECK: call ptr @__work_group_reserve_write_pipe(ptr %{{.*}}, i32 {{.*}}, i32 4, i32 4)
reserve_id_t rid = work_group_reserve_write_pipe(p, 2);
+ // CHECK-SPIR: call spir_func void @__work_group_commit_write_pipe(target("spirv.Pipe", 1) %{{.*}}, target("spirv.ReserveId") %{{.*}}, i32 4, i32 4)
// CHECK: call void @__work_group_commit_write_pipe(ptr %{{.*}}, ptr %{{.*}}, i32 4, i32 4)
work_group_commit_write_pipe(p, rid);
}
void test5(read_only pipe int p, global int *ptr) {
+ // CHECK-SPIR: call spir_func target("spirv.ReserveId") @__sub_group_reserve_read_pipe(target("spirv.Pipe", 0) %{{.*}}, i32 {{.*}}, i32 4, i32 4)
// CHECK: call ptr @__sub_group_reserve_read_pipe(ptr %{{.*}}, i32 {{.*}}, i32 4, i32 4)
reserve_id_t rid = sub_group_reserve_read_pipe(p, 2);
+ // CHECK-SPIR: call spir_func void @__sub_group_commit_read_pipe(target("spirv.Pipe", 0) %{{.*}}, target("spirv.ReserveId") %{{.*}}, i32 4, i32 4)
// CHECK: call void @__sub_group_commit_read_pipe(ptr %{{.*}}, ptr %{{.*}}, i32 4, i32 4)
sub_group_commit_read_pipe(p, rid);
}
void test6(write_only pipe int p, global int *ptr) {
+ // CHECK-SPIR: call spir_func target("spirv.ReserveId") @__sub_group_reserve_write_pipe(target("spirv.Pipe", 1) %{{.*}}, i32 {{.*}}, i32 4, i32 4)
// CHECK: call ptr @__sub_group_reserve_write_pipe(ptr %{{.*}}, i32 {{.*}}, i32 4, i32 4)
reserve_id_t rid = sub_group_reserve_write_pipe(p, 2);
+ // CHECK-SPIR: call spir_func void @__sub_group_commit_write_pipe(target("spirv.Pipe", 1) %{{.*}}, target("spirv.ReserveId") %{{.*}}, i32 4, i32 4)
// CHECK: call void @__sub_group_commit_write_pipe(ptr %{{.*}}, ptr %{{.*}}, i32 4, i32 4)
sub_group_commit_write_pipe(p, rid);
}
void test7(read_only pipe int p, global int *ptr) {
+ // CHECK-SPIR: call spir_func i32 @__get_pipe_num_packets_ro(target("spirv.Pipe", 0) %{{.*}}, i32 4, i32 4)
// CHECK: call i32 @__get_pipe_num_packets_ro(ptr %{{.*}}, i32 4, i32 4)
*ptr = get_pipe_num_packets(p);
+ // CHECK-SPIR: call spir_func i32 @__get_pipe_max_packets_ro(target("spirv.Pipe", 0) %{{.*}}, i32 4, i32 4)
// CHECK: call i32 @__get_pipe_max_packets_ro(ptr %{{.*}}, i32 4, i32 4)
*ptr = get_pipe_max_packets(p);
}
void test8(write_only pipe int p, global int *ptr) {
+ // CHECK-SPIR: call spir_func i32 @__get_pipe_num_packets_wo(target("spirv.Pipe", 1) %{{.*}}, i32 4, i32 4)
// CHECK: call i32 @__get_pipe_num_packets_wo(ptr %{{.*}}, i32 4, i32 4)
*ptr = get_pipe_num_packets(p);
+ // CHECK-SPIR: call spir_func i32 @__get_pipe_max_packets_wo(target("spirv.Pipe", 1) %{{.*}}, i32 4, i32 4)
// CHECK: call i32 @__get_pipe_max_packets_wo(ptr %{{.*}}, i32 4, i32 4)
*ptr = get_pipe_max_packets(p);
}
diff --git a/clang/test/Driver/cl-options.c b/clang/test/Driver/cl-options.c
index 48d281b..8191fda 100644
--- a/clang/test/Driver/cl-options.c
+++ b/clang/test/Driver/cl-options.c
@@ -605,6 +605,9 @@
// RUN: %clang_cl -fmsc-version=1900 -TP -std:c++20 -### -- %s 2>&1 | FileCheck -check-prefix=STDCXX20 %s
// STDCXX20: -std=c++20
+// RUN: %clang_cl -fmsc-version=1900 -TP -std:c++23preview -### -- %s 2>&1 | FileCheck -check-prefix=STDCXX23PREVIEW %s
+// STDCXX23PREVIEW: -std=c++23
+
// RUN: %clang_cl -fmsc-version=1900 -TP -std:c++latest -### -- %s 2>&1 | FileCheck -check-prefix=STDCXXLATEST %s
// STDCXXLATEST: -std=c++26
diff --git a/clang/test/ExtractAPI/anonymous_record_no_typedef.c b/clang/test/ExtractAPI/anonymous_record_no_typedef.c
index 064c223..c0c76ef 100644
--- a/clang/test/ExtractAPI/anonymous_record_no_typedef.c
+++ b/clang/test/ExtractAPI/anonymous_record_no_typedef.c
@@ -1,11 +1,18 @@
// RUN: rm -rf %t
// RUN: %clang_cc1 -extract-api --pretty-sgf --emit-sgf-symbol-labels-for-testing \
// RUN: -triple arm64-apple-macosx -isystem %S -fretain-comments-from-system-headers \
-// RUN: -x c-header %s -o %t/output.symbols.json -verify
+// RUN: -x c-header %s -o %t/output-c.symbols.json -verify
+//
+// RUN: %clang_cc1 -extract-api --pretty-sgf --emit-sgf-symbol-labels-for-testing \
+// RUN: -triple arm64-apple-macosx -isystem %S -fretain-comments-from-system-headers \
+// RUN: -x c++-header %s -o %t/output-cxx.symbols.json -verify
-// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix GLOBAL
-// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix PREFIX
-// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix CONTENT
+// RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix GLOBAL
+// RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix PREFIX
+// RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix CONTENT
+// RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix GLOBAL
+// RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix PREFIX
+// RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix CONTENT
/// A global variable with an anonymous struct type.
struct { char *prefix; char *content; } global;
// GLOBAL-LABEL: "!testLabel": "c:@global"
@@ -30,7 +37,7 @@ struct { char *prefix; char *content; } global;
// GLOBAL: "text": "A global variable with an anonymous struct type."
// GLOBAL: "kind": {
// GLOBAL-NEXT: "displayName": "Global Variable",
-// GLOBAL-NEXT: "identifier": "c.var"
+// GLOBAL-NEXT: "identifier": "c{{(\+\+)?}}.var"
// GLOBAL: "title": "global"
// GLOBAL: "pathComponents": [
// GLOBAL-NEXT: "global"
@@ -54,9 +61,12 @@ struct { char *prefix; char *content; } global;
/// A Vehicle
struct Vehicle {
- // RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix TYPE
- // RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix BICYCLE
- // RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix CAR
+ // RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix TYPE
+ // RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix BICYCLE
+ // RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix CAR
+ // RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix TYPE
+ // RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix BICYCLE
+ // RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix CAR
/// The type of vehicle.
enum {
Bicycle,
@@ -96,9 +106,12 @@ struct Vehicle {
// CAR-NEXT: "Car"
// CAR-NEXT: ]
- // RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix INFORMATION
- // RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix WHEELS
- // RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix NAME
+ // RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix INFORMATION
+ // RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix WHEELS
+ // RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix NAME
+ // RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix INFORMATION
+ // RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix WHEELS
+ // RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix NAME
/// The information about the vehicle.
union {
int wheels;
@@ -145,8 +158,10 @@ struct Vehicle {
// NAME-NEXT: ]
};
-// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix GLOBALCASE
-// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix GLOBALOTHERCASE
+// RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix GLOBALCASE
+// RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix GLOBALOTHERCASE
+// RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix GLOBALCASE
+// RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix GLOBALOTHERCASE
enum {
GlobalCase,
GlobalOtherCase
@@ -163,7 +178,8 @@ enum {
// GLOBALOTHERCASE-NEXT: "GlobalOtherCase"
// GLOBALOTHERCASE-NEXT: ]
-// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix VEC
+// RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix VEC
+// RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix VEC
union Vector {
struct {
float X;
diff --git a/clang/test/ExtractAPI/typedef_anonymous_record.c b/clang/test/ExtractAPI/typedef_anonymous_record.c
index 8e298f8..c100e30 100644
--- a/clang/test/ExtractAPI/typedef_anonymous_record.c
+++ b/clang/test/ExtractAPI/typedef_anonymous_record.c
@@ -1,8 +1,11 @@
// RUN: rm -rf %t
// RUN: %clang_cc1 -extract-api --pretty-sgf --emit-sgf-symbol-labels-for-testing \
-// RUN: --product-name=TypedefChain -triple arm64-apple-macosx -x c-header %s -o %t/typedefchain.symbols.json -verify
+// RUN: --product-name=TypedefChain -triple arm64-apple-macosx -x c-header %s -o %t/typedefchain-c.symbols.json -verify
+// RUN: %clang_cc1 -extract-api --pretty-sgf --emit-sgf-symbol-labels-for-testing \
+// RUN: --product-name=TypedefChain -triple arm64-apple-macosx -x c++-header %s -o %t/typedefchain-cxx.symbols.json -verify
-// RUN: FileCheck %s --input-file %t/typedefchain.symbols.json --check-prefix MYSTRUCT
+// RUN: FileCheck %s --input-file %t/typedefchain-c.symbols.json --check-prefix MYSTRUCT
+// RUN: FileCheck %s --input-file %t/typedefchain-cxx.symbols.json --check-prefix MYSTRUCT
typedef struct { } MyStruct;
// MYSTRUCT-LABEL: "!testLabel": "c:@SA@MyStruct"
// MYSTRUCT: "accessLevel": "public",
@@ -34,7 +37,7 @@ typedef struct { } MyStruct;
// MYSTRUCT-NEXT: ]
// MYSTRUCT: "kind": {
// MYSTRUCT-NEXT: "displayName": "Structure",
-// MYSTRUCT-NEXT: "identifier": "c.struct"
+// MYSTRUCT-NEXT: "identifier": "c{{(\+\+)?}}.struct"
// MYSTRUCT: "names": {
// MYSTRUCT-NEXT: "navigator": [
// MYSTRUCT-NEXT: {
@@ -54,7 +57,8 @@ typedef struct { } MyStruct;
// MYSTRUCT-NEXT: "MyStruct"
// MYSTRUCT-NEXT: ]
-// RUN: FileCheck %s --input-file %t/typedefchain.symbols.json --check-prefix MYSTRUCTSTRUCT
+// RUN: FileCheck %s --input-file %t/typedefchain-c.symbols.json --check-prefix MYSTRUCTSTRUCT
+// RUN: FileCheck %s --input-file %t/typedefchain-cxx.symbols.json --check-prefix MYSTRUCTSTRUCT
typedef MyStruct MyStructStruct;
// MYSTRUCTSTRUCT-LABEL: "!testLabel": "c:typedef_anonymous_record.c@T@MyStructStruct"
// MYSTRUCTSTRUCT: "accessLevel": "public",
@@ -87,10 +91,12 @@ typedef MyStruct MyStructStruct;
// MYSTRUCTSTRUCT-NEXT:],
// MYSTRUCTSTRUCT: "kind": {
// MYSTRUCTSTRUCT-NEXT: "displayName": "Type Alias",
-// MYSTRUCTSTRUCT-NEXT: "identifier": "c.typealias"
+// MYSTRUCTSTRUCT-NEXT: "identifier": "c{{(\+\+)?}}.typealias"
-// RUN: FileCheck %s --input-file %t/typedefchain.symbols.json --check-prefix MYENUM
-// RUN: FileCheck %s --input-file %t/typedefchain.symbols.json --check-prefix CASE
+// RUN: FileCheck %s --input-file %t/typedefchain-c.symbols.json --check-prefix MYENUM
+// RUN: FileCheck %s --input-file %t/typedefchain-c.symbols.json --check-prefix CASE
+// RUN: FileCheck %s --input-file %t/typedefchain-cxx.symbols.json --check-prefix MYENUM
+// RUN: FileCheck %s --input-file %t/typedefchain-cxx.symbols.json --check-prefix CASE
typedef enum { Case } MyEnum;
// MYENUM: "source": "c:@EA@MyEnum@Case",
// MYENUM-NEXT: "target": "c:@EA@MyEnum",
@@ -124,7 +130,7 @@ typedef enum { Case } MyEnum;
// MYENUM-NEXT:],
// MYENUM: "kind": {
// MYENUM-NEXT: "displayName": "Enumeration",
-// MYENUM-NEXT: "identifier": "c.enum"
+// MYENUM-NEXT: "identifier": "c{{(\+\+)?}}.enum"
// MYENUM: "names": {
// MYENUM-NEXT: "navigator": [
// MYENUM-NEXT: {
@@ -147,7 +153,8 @@ typedef enum { Case } MyEnum;
// CASE-NEXT: "Case"
// CASE-NEXT: ]
-// RUN: FileCheck %s --input-file %t/typedefchain.symbols.json --check-prefix MYENUMENUM
+// RUN: FileCheck %s --input-file %t/typedefchain-c.symbols.json --check-prefix MYENUMENUM
+// RUN: FileCheck %s --input-file %t/typedefchain-cxx.symbols.json --check-prefix MYENUMENUM
typedef MyEnum MyEnumEnum;
// MYENUMENUM-LABEL: "!testLabel": "c:typedef_anonymous_record.c@T@MyEnumEnum"
// MYENUMENUM: "declarationFragments": [
@@ -179,7 +186,7 @@ typedef MyEnum MyEnumEnum;
// MYENUMENUM-NEXT: ],
// MYENUMENUM: "kind": {
// MYENUMENUM-NEXT: "displayName": "Type Alias",
-// MYENUMENUM-NEXT: "identifier": "c.typealias"
+// MYENUMENUM-NEXT: "identifier": "c{{(\+\+)?}}.typealias"
// MYENUMENUM-NEXT: },
// MYENUMENUM: "title": "MyEnumEnum"
diff --git a/clang/test/Modules/friend-definition-2.cpp b/clang/test/Modules/friend-definition-2.cpp
index 41c2141f..d91ce14 100644
--- a/clang/test/Modules/friend-definition-2.cpp
+++ b/clang/test/Modules/friend-definition-2.cpp
@@ -1,32 +1,53 @@
-// RUN: %clang_cc1 -std=c++14 -fmodules %s -verify
-// RUN: %clang_cc1 -std=c++14 -fmodules %s -verify -triple i686-windows
-// expected-no-diagnostics
-#pragma clang module build A
-module A {}
-#pragma clang module contents
-#pragma clang module begin A
+// RUN: split-file %s %t
+
+// RUN: %clang_cc1 -std=c++14 -x c++ -fmodules -fmodule-name=A -emit-module %t/a.modulemap -o %t/a.pcm
+// RUN: %clang_cc1 -std=c++14 -x c++ -fmodules -fmodule-name=B -emit-module %t/b.modulemap -o %t/b.pcm
+// RUN: %clang_cc1 -std=c++14 -x c++ -fmodules -fmodule-map-file=%t/a.modulemap -fmodule-map-file=%t/b.modulemap \
+// RUN: -fmodule-file=%t/a.pcm -fmodule-file=%t/b.pcm \
+// RUN: %t/use.cc -verify
+
+// RUN: rm -f %t/*.pcm
+
+// RUN: %clang_cc1 -std=c++14 -x c++ -fmodules -fmodule-name=A -emit-module %t/a.modulemap -o %t/a.pcm -triple i686-windows
+// RUN: %clang_cc1 -std=c++14 -x c++ -fmodules -fmodule-name=B -emit-module %t/b.modulemap -o %t/b.pcm -triple i686-windows
+// RUN: %clang_cc1 -std=c++14 -x c++ -fmodules -fmodule-map-file=%t/a.modulemap -fmodule-map-file=%t/b.modulemap \
+// RUN: -fmodule-file=%t/a.pcm -fmodule-file=%t/b.pcm \
+// RUN: %t/use.cc -verify -triple i686-windows
+
+//--- a.modulemap
+module A {
+ header "a.h"
+}
+
+//--- a.h
+#ifndef A_H
+#define A_H
+template<typename T> struct ct { friend auto operator-(ct, ct) { struct X {}; return X(); } void x(); };
+#endif
+
+//--- b.modulemap
+module B {
+ header "b.h"
+}
+
+//--- b.h
+#ifndef B_H
+#define B_H
template<typename T> struct ct { friend auto operator-(ct, ct) { struct X {}; return X(); } void x(); };
-#pragma clang module end
-#pragma clang module endbuild
-
-#pragma clang module build B
-module B {}
-#pragma clang module contents
-#pragma clang module begin B
-template<typename T> struct ct { friend auto operator-(ct, ct) { struct X{}; return X(); } void x(); };
inline auto f() { return ct<float>() - ct<float>(); }
-#pragma clang module end
-#pragma clang module endbuild
+#endif
+//--- use.cc
+// expected-no-diagnostics
// Force the definition of ct in module A to be the primary definition.
-#pragma clang module import A
+#include "a.h"
template<typename T> void ct<T>::x() {}
// Attempt to cause the definition of operator- in the ct primary template in
// module B to be the primary definition of that function. If that happens,
// we'll be left with a class template ct that appears to not contain a
// definition of the inline friend function.
-#pragma clang module import B
+#include "b.h"
auto v = f();
ct<int> make();
diff --git a/clang/test/Preprocessor/predefined-win-macros.c b/clang/test/Preprocessor/predefined-win-macros.c
index 7d29e45..8e539a2 100644
--- a/clang/test/Preprocessor/predefined-win-macros.c
+++ b/clang/test/Preprocessor/predefined-win-macros.c
@@ -56,7 +56,12 @@
// RUN: %clang_cc1 %s -x c++ -E -dM -triple i686-pc-win32 -fms-extensions -fms-compatibility \
// RUN: -fms-compatibility-version=19.00 -std=c++23 -o - | FileCheck -match-full-lines %s --check-prefix=CHECK-MS-CPP2B
// CHECK-MS-CPP2B: #define _MSC_VER 1900
-// CHECK-MS-CPP2B: #define _MSVC_LANG 202004L
+// CHECK-MS-CPP2B: #define _MSVC_LANG 202302L
+
+// RUN: %clang_cc1 %s -x c++ -E -dM -triple i686-pc-win32 -fms-extensions -fms-compatibility \
+// RUN: -fms-compatibility-version=19.00 -std=c++26 -o - | FileCheck -match-full-lines %s --check-prefix=CHECK-MS-CPP2C
+// CHECK-MS-CPP2C: #define _MSC_VER 1900
+// CHECK-MS-CPP2C: #define _MSVC_LANG 202400L
// RUN: %clang_cc1 -triple i386-windows %s -E -dM -o - \
// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-X86-WIN
diff --git a/clang/test/SemaCXX/constexpr-builtin-bit-cast.cpp b/clang/test/SemaCXX/constexpr-builtin-bit-cast.cpp
index 7520b43..5ddb77b 100644
--- a/clang/test/SemaCXX/constexpr-builtin-bit-cast.cpp
+++ b/clang/test/SemaCXX/constexpr-builtin-bit-cast.cpp
@@ -511,3 +511,19 @@ constexpr bool9 bad_short_to_bool9 = __builtin_bit_cast(bool9, static_cast<unsig
constexpr bool17 bad_int_to_bool17 = __builtin_bit_cast(bool17, 0x0001CAFEU);
}
+
+namespace test_complex {
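+  // Note: _Complex T is laid out as two consecutive T values (real part
+  // first), which is what makes the bit_casts below well-defined.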
+ constexpr _Complex unsigned test_int_complex = { 0x0C05FEFE, 0xCAFEBABE };
+ static_assert(round_trip<_Complex unsigned>(0xCAFEBABE0C05FEFEULL), "");
+ static_assert(bit_cast<unsigned long long>(test_int_complex) == (LITTLE_END
+ ? 0xCAFEBABE0C05FEFE
+ : 0x0C05FEFECAFEBABE), "");
+ static_assert(sizeof(double) == 2 * sizeof(float));
+ struct TwoFloats { float A; float B; };
+ constexpr _Complex float test_float_complex = {1.0f, 2.0f};
+ constexpr TwoFloats TF = __builtin_bit_cast(TwoFloats, test_float_complex);
+ static_assert(TF.A == 1.0f && TF.B == 2.0f);
+
+ constexpr double D = __builtin_bit_cast(double, test_float_complex);
+ constexpr int M = __builtin_bit_cast(int, test_int_complex); // expected-error {{__builtin_bit_cast source size does not equal destination size}}
+}
diff --git a/clang/test/SemaCXX/virtual-override.cpp b/clang/test/SemaCXX/virtual-override.cpp
index 72abfc3..d37c275 100644
--- a/clang/test/SemaCXX/virtual-override.cpp
+++ b/clang/test/SemaCXX/virtual-override.cpp
@@ -19,10 +19,12 @@ struct b { };
class A {
virtual a* f(); // expected-note{{overridden virtual function is here}}
+ virtual int *g(); // expected-note{{overridden virtual function is here}}
};
class B : A {
virtual b* f(); // expected-error{{return type of virtual function 'f' is not covariant with the return type of the function it overrides ('b *' is not derived from 'a *')}}
+ virtual char *g(); // expected-error{{virtual function 'g' has a different return type ('char *') than the function it overrides (which has return type 'int *')}}
};
}
@@ -83,11 +85,15 @@ struct a { };
class A {
virtual const a* f();
virtual a* g(); // expected-note{{overridden virtual function is here}}
+ virtual const int* h(); // expected-note{{overridden virtual function is here}}
+ virtual int* i(); // expected-note{{overridden virtual function is here}}
};
class B : A {
virtual a* f();
virtual const a* g(); // expected-error{{return type of virtual function 'g' is not covariant with the return type of the function it overrides (class type 'const a *' is more qualified than class type 'a *'}}
+ virtual int* h(); // expected-error{{virtual function 'h' has a different return type ('int *') than the function it overrides (which has return type 'const int *')}}
+ virtual const int* i(); // expected-error{{virtual function 'i' has a different return type ('const int *') than the function it overrides (which has return type 'int *')}}
};
}
diff --git a/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt b/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt
index 12fee5d..4e1819b 100644
--- a/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt
+++ b/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt
@@ -7,6 +7,7 @@ add_clang_unittest(ClangAnalysisFlowSensitiveTests
ArenaTest.cpp
ASTOpsTest.cpp
CFGMatchSwitchTest.cpp
+ CachedConstAccessorsLatticeTest.cpp
ChromiumCheckModelTest.cpp
DataflowAnalysisContextTest.cpp
DataflowEnvironmentTest.cpp
diff --git a/clang/unittests/Analysis/FlowSensitive/CachedConstAccessorsLatticeTest.cpp b/clang/unittests/Analysis/FlowSensitive/CachedConstAccessorsLatticeTest.cpp
new file mode 100644
index 0000000..6488833
--- /dev/null
+++ b/clang/unittests/Analysis/FlowSensitive/CachedConstAccessorsLatticeTest.cpp
@@ -0,0 +1,305 @@
+//===- unittests/Analysis/FlowSensitive/CachedConstAccessorsLatticeTest.cpp ==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Analysis/FlowSensitive/CachedConstAccessorsLattice.h"
+
+#include <cassert>
+#include <memory>
+
+#include "clang/AST/Decl.h"
+#include "clang/AST/DeclBase.h"
+#include "clang/AST/DeclCXX.h"
+#include "clang/AST/Expr.h"
+#include "clang/AST/Type.h"
+#include "clang/ASTMatchers/ASTMatchFinder.h"
+#include "clang/ASTMatchers/ASTMatchers.h"
+#include "clang/Analysis/FlowSensitive/DataflowAnalysisContext.h"
+#include "clang/Analysis/FlowSensitive/DataflowLattice.h"
+#include "clang/Analysis/FlowSensitive/NoopLattice.h"
+#include "clang/Analysis/FlowSensitive/StorageLocation.h"
+#include "clang/Analysis/FlowSensitive/Value.h"
+#include "clang/Analysis/FlowSensitive/WatchedLiteralsSolver.h"
+#include "clang/Basic/LLVM.h"
+#include "clang/Testing/TestAST.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace clang::dataflow {
+namespace {
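+// These tests cover CachedConstAccessorsLattice, which memoizes the results
+// of const accessor calls (values or storage locations), keyed on the
+// object's storage location, until the cache is cleared.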
+
+using ast_matchers::BoundNodes;
+using ast_matchers::callee;
+using ast_matchers::cxxMemberCallExpr;
+using ast_matchers::functionDecl;
+using ast_matchers::hasName;
+using ast_matchers::match;
+using ast_matchers::selectFirst;
+
+using dataflow::DataflowAnalysisContext;
+using dataflow::Environment;
+using dataflow::LatticeJoinEffect;
+using dataflow::RecordStorageLocation;
+using dataflow::Value;
+using dataflow::WatchedLiteralsSolver;
+
+using testing::SizeIs;
+
+NamedDecl *lookup(StringRef Name, const DeclContext &DC) {
+ auto Result = DC.lookup(&DC.getParentASTContext().Idents.get(Name));
+ EXPECT_TRUE(Result.isSingleResult()) << Name;
+ return Result.front();
+}
+
+class CachedConstAccessorsLatticeTest : public ::testing::Test {
+protected:
+ using LatticeT = CachedConstAccessorsLattice<NoopLattice>;
+
+ DataflowAnalysisContext DACtx{std::make_unique<WatchedLiteralsSolver>()};
+ Environment Env{DACtx};
+};
+
+// Basic test AST with two const methods (one returns a value, the other a
+// reference).
+struct CommonTestInputs {
+ CommonTestInputs()
+ : AST(R"cpp(
+ struct S {
+ int *valProperty() const;
+ int &refProperty() const;
+ };
+ void target() {
+ S s;
+ s.valProperty();
+ S s2;
+ s2.refProperty();
+ }
+ )cpp") {
+ auto *SDecl = cast<CXXRecordDecl>(
+ lookup("S", *AST.context().getTranslationUnitDecl()));
+ SType = AST.context().getRecordType(SDecl);
+ CallVal = selectFirst<CallExpr>(
+ "call",
+ match(cxxMemberCallExpr(callee(functionDecl(hasName("valProperty"))))
+ .bind("call"),
+ AST.context()));
+ assert(CallVal != nullptr);
+
+ CallRef = selectFirst<CallExpr>(
+ "call",
+ match(cxxMemberCallExpr(callee(functionDecl(hasName("refProperty"))))
+ .bind("call"),
+ AST.context()));
+ assert(CallRef != nullptr);
+ }
+
+ TestAST AST;
+ QualType SType;
+ const CallExpr *CallVal;
+ const CallExpr *CallRef;
+};
+
+TEST_F(CachedConstAccessorsLatticeTest,
+ SamePrimitiveValBeforeClearOrDiffAfterClear) {
+ CommonTestInputs Inputs;
+ auto *CE = Inputs.CallVal;
+ RecordStorageLocation Loc(Inputs.SType, RecordStorageLocation::FieldToLoc(),
+ {});
+
+ LatticeT Lattice;
+ Value *Val1 = Lattice.getOrCreateConstMethodReturnValue(Loc, CE, Env);
+ Value *Val2 = Lattice.getOrCreateConstMethodReturnValue(Loc, CE, Env);
+
+ EXPECT_EQ(Val1, Val2);
+
+ Lattice.clearConstMethodReturnValues(Loc);
+ Value *Val3 = Lattice.getOrCreateConstMethodReturnValue(Loc, CE, Env);
+
+ EXPECT_NE(Val3, Val1);
+ EXPECT_NE(Val3, Val2);
+}
+
+TEST_F(CachedConstAccessorsLatticeTest, SameLocBeforeClearOrDiffAfterClear) {
+ CommonTestInputs Inputs;
+ auto *CE = Inputs.CallRef;
+ RecordStorageLocation Loc(Inputs.SType, RecordStorageLocation::FieldToLoc(),
+ {});
+
+ LatticeT Lattice;
+ auto NopInit = [](StorageLocation &) {};
+ StorageLocation *Loc1 = Lattice.getOrCreateConstMethodReturnStorageLocation(
+ Loc, CE, Env, NopInit);
+ auto NotCalled = [](StorageLocation &) {
+ ASSERT_TRUE(false) << "Not reached";
+ };
+ StorageLocation *Loc2 = Lattice.getOrCreateConstMethodReturnStorageLocation(
+ Loc, CE, Env, NotCalled);
+
+ EXPECT_EQ(Loc1, Loc2);
+
+ Lattice.clearConstMethodReturnStorageLocations(Loc);
+ StorageLocation *Loc3 = Lattice.getOrCreateConstMethodReturnStorageLocation(
+ Loc, CE, Env, NopInit);
+
+ EXPECT_NE(Loc3, Loc1);
+ EXPECT_NE(Loc3, Loc2);
+}
+
+TEST_F(CachedConstAccessorsLatticeTest,
+ SameStructValBeforeClearOrDiffAfterClear) {
+ TestAST AST(R"cpp(
+ struct S {
+ S structValProperty() const;
+ };
+ void target() {
+ S s;
+ s.structValProperty();
+ }
+ )cpp");
+ auto *SDecl =
+ cast<CXXRecordDecl>(lookup("S", *AST.context().getTranslationUnitDecl()));
+ QualType SType = AST.context().getRecordType(SDecl);
+ const CallExpr *CE = selectFirst<CallExpr>(
+ "call", match(cxxMemberCallExpr(
+ callee(functionDecl(hasName("structValProperty"))))
+ .bind("call"),
+ AST.context()));
+ ASSERT_NE(CE, nullptr);
+
+ RecordStorageLocation Loc(SType, RecordStorageLocation::FieldToLoc(), {});
+
+ LatticeT Lattice;
+ // Accessors that return a record by value are modeled by a record storage
+ // location (instead of a Value).
+ auto NopInit = [](StorageLocation &) {};
+ StorageLocation *Loc1 = Lattice.getOrCreateConstMethodReturnStorageLocation(
+ Loc, CE, Env, NopInit);
+ auto NotCalled = [](StorageLocation &) {
+ ASSERT_TRUE(false) << "Not reached";
+ };
+ StorageLocation *Loc2 = Lattice.getOrCreateConstMethodReturnStorageLocation(
+ Loc, CE, Env, NotCalled);
+
+ EXPECT_EQ(Loc1, Loc2);
+
+ Lattice.clearConstMethodReturnStorageLocations(Loc);
+ StorageLocation *Loc3 = Lattice.getOrCreateConstMethodReturnStorageLocation(
+ Loc, CE, Env, NopInit);
+
+ EXPECT_NE(Loc3, Loc1);
+  EXPECT_NE(Loc3, Loc2);
+}
+
+TEST_F(CachedConstAccessorsLatticeTest, ClearDifferentLocs) {
+ CommonTestInputs Inputs;
+ auto *CE = Inputs.CallRef;
+ RecordStorageLocation LocS1(Inputs.SType, RecordStorageLocation::FieldToLoc(),
+ {});
+ RecordStorageLocation LocS2(Inputs.SType, RecordStorageLocation::FieldToLoc(),
+ {});
+
+ LatticeT Lattice;
+ auto NopInit = [](StorageLocation &) {};
+ StorageLocation *RetLoc1 =
+ Lattice.getOrCreateConstMethodReturnStorageLocation(LocS1, CE, Env,
+ NopInit);
+ Lattice.clearConstMethodReturnStorageLocations(LocS2);
+ auto NotCalled = [](StorageLocation &) {
+ ASSERT_TRUE(false) << "Not reached";
+ };
+ StorageLocation *RetLoc2 =
+ Lattice.getOrCreateConstMethodReturnStorageLocation(LocS1, CE, Env,
+ NotCalled);
+
+ EXPECT_EQ(RetLoc1, RetLoc2);
+}
+
+TEST_F(CachedConstAccessorsLatticeTest, DifferentValsFromDifferentLocs) {
+ TestAST AST(R"cpp(
+ struct S {
+ int *valProperty() const;
+ };
+ void target() {
+ S s1;
+ s1.valProperty();
+ S s2;
+ s2.valProperty();
+ }
+ )cpp");
+ auto *SDecl =
+ cast<CXXRecordDecl>(lookup("S", *AST.context().getTranslationUnitDecl()));
+ QualType SType = AST.context().getRecordType(SDecl);
+ SmallVector<BoundNodes, 1> valPropertyCalls =
+ match(cxxMemberCallExpr(callee(functionDecl(hasName("valProperty"))))
+ .bind("call"),
+ AST.context());
+ ASSERT_THAT(valPropertyCalls, SizeIs(2));
+
+ const CallExpr *CE1 = selectFirst<CallExpr>("call", valPropertyCalls);
+ ASSERT_NE(CE1, nullptr);
+
+ valPropertyCalls.erase(valPropertyCalls.begin());
+ const CallExpr *CE2 = selectFirst<CallExpr>("call", valPropertyCalls);
+ ASSERT_NE(CE2, nullptr);
+ ASSERT_NE(CE1, CE2);
+
+ RecordStorageLocation LocS1(SType, RecordStorageLocation::FieldToLoc(), {});
+ RecordStorageLocation LocS2(SType, RecordStorageLocation::FieldToLoc(), {});
+
+ LatticeT Lattice;
+ Value *Val1 = Lattice.getOrCreateConstMethodReturnValue(LocS1, CE1, Env);
+ Value *Val2 = Lattice.getOrCreateConstMethodReturnValue(LocS2, CE2, Env);
+
+ EXPECT_NE(Val1, Val2);
+}
+
+TEST_F(CachedConstAccessorsLatticeTest, JoinSameNoop) {
+ CommonTestInputs Inputs;
+ auto *CE = Inputs.CallVal;
+ RecordStorageLocation Loc(Inputs.SType, RecordStorageLocation::FieldToLoc(),
+ {});
+
+ LatticeT EmptyLattice;
+ LatticeT EmptyLattice2;
+ EXPECT_EQ(EmptyLattice.join(EmptyLattice2), LatticeJoinEffect::Unchanged);
+
+ LatticeT Lattice1;
+ Lattice1.getOrCreateConstMethodReturnValue(Loc, CE, Env);
+ EXPECT_EQ(Lattice1.join(Lattice1), LatticeJoinEffect::Unchanged);
+}
+
+TEST_F(CachedConstAccessorsLatticeTest, ProducesNewValueAfterJoinDistinct) {
+ CommonTestInputs Inputs;
+ auto *CE = Inputs.CallVal;
+ RecordStorageLocation Loc(Inputs.SType, RecordStorageLocation::FieldToLoc(),
+ {});
+
+  // Case 1: Lattice1 holds a cached value; EmptyLattice holds none.
+ LatticeT Lattice1;
+ Value *Val1 = Lattice1.getOrCreateConstMethodReturnValue(Loc, CE, Env);
+
+ LatticeT EmptyLattice;
+
+ EXPECT_EQ(Lattice1.join(EmptyLattice), LatticeJoinEffect::Changed);
+ Value *ValAfterJoin =
+ Lattice1.getOrCreateConstMethodReturnValue(Loc, CE, Env);
+
+ EXPECT_NE(ValAfterJoin, Val1);
+
+  // Case 2: Lattice1 and Lattice3 each hold their own cached value.
+ LatticeT Lattice3;
+ Value *Val3 = Lattice3.getOrCreateConstMethodReturnValue(Loc, CE, Env);
+
+ EXPECT_EQ(Lattice1.join(Lattice3), LatticeJoinEffect::Changed);
+ Value *ValAfterJoin2 =
+ Lattice1.getOrCreateConstMethodReturnValue(Loc, CE, Env);
+
+ EXPECT_NE(ValAfterJoin2, ValAfterJoin);
+ EXPECT_NE(ValAfterJoin2, Val3);
+}
+
+} // namespace
+} // namespace clang::dataflow
diff --git a/clang/utils/TableGen/MveEmitter.cpp b/clang/utils/TableGen/MveEmitter.cpp
index 915e914..51e5709 100644
--- a/clang/utils/TableGen/MveEmitter.cpp
+++ b/clang/utils/TableGen/MveEmitter.cpp
@@ -1033,15 +1033,15 @@ public:
// to expand Tablegen classes like 'Vector' which mean something different in
// each member of a parametric family.
const Type *getType(const Record *R, const Type *Param);
- const Type *getType(DagInit *D, const Type *Param);
- const Type *getType(Init *I, const Type *Param);
+ const Type *getType(const DagInit *D, const Type *Param);
+ const Type *getType(const Init *I, const Type *Param);
// Functions that translate the Tablegen representation of an intrinsic's
// code generation into a collection of Value objects (which will then be
// reprocessed to read out the actual C++ code included by CGBuiltin.cpp).
- Result::Ptr getCodeForDag(DagInit *D, const Result::Scope &Scope,
+ Result::Ptr getCodeForDag(const DagInit *D, const Result::Scope &Scope,
const Type *Param);
- Result::Ptr getCodeForDagArg(DagInit *D, unsigned ArgNum,
+ Result::Ptr getCodeForDagArg(const DagInit *D, unsigned ArgNum,
const Result::Scope &Scope, const Type *Param);
Result::Ptr getCodeForArg(unsigned ArgNum, const Type *ArgType, bool Promote,
bool Immediate);
@@ -1060,10 +1060,10 @@ public:
void EmitBuiltinAliases(raw_ostream &OS);
};
-const Type *EmitterBase::getType(Init *I, const Type *Param) {
- if (auto Dag = dyn_cast<DagInit>(I))
+const Type *EmitterBase::getType(const Init *I, const Type *Param) {
+ if (const auto *Dag = dyn_cast<DagInit>(I))
return getType(Dag, Param);
- if (auto Def = dyn_cast<DefInit>(I))
+ if (const auto *Def = dyn_cast<DefInit>(I))
return getType(Def->getDef(), Param);
PrintFatalError("Could not convert this value into a type");
@@ -1088,7 +1088,7 @@ const Type *EmitterBase::getType(const Record *R, const Type *Param) {
PrintFatalError(R->getLoc(), "Could not convert this record into a type");
}
-const Type *EmitterBase::getType(DagInit *D, const Type *Param) {
+const Type *EmitterBase::getType(const DagInit *D, const Type *Param) {
// The meat of the getType system: types in the Tablegen are represented by a
// dag whose operators select sub-cases of this function.
@@ -1156,7 +1156,8 @@ const Type *EmitterBase::getType(DagInit *D, const Type *Param) {
PrintFatalError("Bad operator in type dag expression");
}
-Result::Ptr EmitterBase::getCodeForDag(DagInit *D, const Result::Scope &Scope,
+Result::Ptr EmitterBase::getCodeForDag(const DagInit *D,
+ const Result::Scope &Scope,
const Type *Param) {
const Record *Op = cast<DefInit>(D->getOperator())->getDef();
@@ -1199,14 +1200,14 @@ Result::Ptr EmitterBase::getCodeForDag(DagInit *D, const Result::Scope &Scope,
Result::Ptr Arg = getCodeForDagArg(D, 0, Scope, Param);
const Type *Ty = nullptr;
- if (auto *DI = dyn_cast<DagInit>(D->getArg(0)))
+ if (const auto *DI = dyn_cast<DagInit>(D->getArg(0)))
if (auto *PTy = dyn_cast<PointerType>(getType(DI->getOperator(), Param)))
Ty = PTy->getPointeeType();
if (!Ty)
PrintFatalError("'address' pointer argument should be a pointer");
unsigned Alignment;
- if (auto *II = dyn_cast<IntInit>(D->getArg(1))) {
+ if (const auto *II = dyn_cast<IntInit>(D->getArg(1))) {
Alignment = II->getValue();
} else {
PrintFatalError("'address' alignment argument should be an integer");
@@ -1267,10 +1268,10 @@ Result::Ptr EmitterBase::getCodeForDag(DagInit *D, const Result::Scope &Scope,
}
}
-Result::Ptr EmitterBase::getCodeForDagArg(DagInit *D, unsigned ArgNum,
+Result::Ptr EmitterBase::getCodeForDagArg(const DagInit *D, unsigned ArgNum,
const Result::Scope &Scope,
const Type *Param) {
- Init *Arg = D->getArg(ArgNum);
+ const Init *Arg = D->getArg(ArgNum);
StringRef Name = D->getArgNameStr(ArgNum);
if (!Name.empty()) {
@@ -1286,18 +1287,18 @@ Result::Ptr EmitterBase::getCodeForDagArg(DagInit *D, unsigned ArgNum,
// Sometimes the Arg is a bit. Prior to multiclass template argument
// checking, integers would sneak through the bit declaration,
// but now they really are bits.
- if (auto *BI = dyn_cast<BitInit>(Arg))
+ if (const auto *BI = dyn_cast<BitInit>(Arg))
return std::make_shared<IntLiteralResult>(getScalarType("u32"),
BI->getValue());
- if (auto *II = dyn_cast<IntInit>(Arg))
+ if (const auto *II = dyn_cast<IntInit>(Arg))
return std::make_shared<IntLiteralResult>(getScalarType("u32"),
II->getValue());
- if (auto *DI = dyn_cast<DagInit>(Arg))
+ if (const auto *DI = dyn_cast<DagInit>(Arg))
return getCodeForDag(DI, Scope, Param);
- if (auto *DI = dyn_cast<DefInit>(Arg)) {
+ if (const auto *DI = dyn_cast<DefInit>(Arg)) {
const Record *Rec = DI->getDef();
if (Rec->isSubClassOf("Type")) {
const Type *T = getType(Rec, Param);
@@ -1307,7 +1308,7 @@ Result::Ptr EmitterBase::getCodeForDagArg(DagInit *D, unsigned ArgNum,
PrintError("bad DAG argument type for code generation");
PrintNote("DAG: " + D->getAsString());
- if (TypedInit *Typed = dyn_cast<TypedInit>(Arg))
+ if (const auto *Typed = dyn_cast<TypedInit>(Arg))
PrintNote("argument type: " + Typed->getType()->getAsString());
PrintFatalNote("argument number " + Twine(ArgNum) + ": " + Arg->getAsString());
}
@@ -1379,13 +1380,13 @@ ACLEIntrinsic::ACLEIntrinsic(EmitterBase &ME, const Record *R,
HeaderOnly = R->getValueAsBit("headerOnly");
// Process the intrinsic's argument list.
- DagInit *ArgsDag = R->getValueAsDag("args");
+ const DagInit *ArgsDag = R->getValueAsDag("args");
Result::Scope Scope;
for (unsigned i = 0, e = ArgsDag->getNumArgs(); i < e; ++i) {
- Init *TypeInit = ArgsDag->getArg(i);
+ const Init *TypeInit = ArgsDag->getArg(i);
bool Promote = true;
- if (auto TypeDI = dyn_cast<DefInit>(TypeInit))
+ if (const auto *TypeDI = dyn_cast<DefInit>(TypeInit))
if (TypeDI->getDef()->isSubClassOf("unpromoted"))
Promote = false;
@@ -1397,7 +1398,7 @@ ACLEIntrinsic::ACLEIntrinsic(EmitterBase &ME, const Record *R,
// If the argument is a subclass of Immediate, record the details about
// what values it can take, for Sema checking.
bool Immediate = false;
- if (auto TypeDI = dyn_cast<DefInit>(TypeInit)) {
+ if (const auto *TypeDI = dyn_cast<DefInit>(TypeInit)) {
const Record *TypeRec = TypeDI->getDef();
if (TypeRec->isSubClassOf("Immediate")) {
Immediate = true;
@@ -1444,7 +1445,7 @@ ACLEIntrinsic::ACLEIntrinsic(EmitterBase &ME, const Record *R,
// Finally, go through the codegen dag and translate it into a Result object
// (with an arbitrary DAG of depended-on Results hanging off it).
- DagInit *CodeDag = R->getValueAsDag("codegen");
+ const DagInit *CodeDag = R->getValueAsDag("codegen");
const Record *MainOp = cast<DefInit>(CodeDag->getOperator())->getDef();
if (MainOp->isSubClassOf("CustomCodegen")) {
// Or, if it's the special case of CustomCodegen, just accumulate
@@ -1456,9 +1457,9 @@ ACLEIntrinsic::ACLEIntrinsic(EmitterBase &ME, const Record *R,
StringRef Name = CodeDag->getArgNameStr(i);
if (Name.empty()) {
PrintFatalError("Operands to CustomCodegen should have names");
- } else if (auto *II = dyn_cast<IntInit>(CodeDag->getArg(i))) {
+ } else if (const auto *II = dyn_cast<IntInit>(CodeDag->getArg(i))) {
CustomCodeGenArgs[std::string(Name)] = itostr(II->getValue());
- } else if (auto *SI = dyn_cast<StringInit>(CodeDag->getArg(i))) {
+ } else if (const auto *SI = dyn_cast<StringInit>(CodeDag->getArg(i))) {
CustomCodeGenArgs[std::string(Name)] = std::string(SI->getValue());
} else {
PrintFatalError("Operands to CustomCodegen should be integers");
diff --git a/compiler-rt/lib/orc/dlfcn_wrapper.cpp b/compiler-rt/lib/orc/dlfcn_wrapper.cpp
index bbbc79f..dec8d1e 100644
--- a/compiler-rt/lib/orc/dlfcn_wrapper.cpp
+++ b/compiler-rt/lib/orc/dlfcn_wrapper.cpp
@@ -20,7 +20,7 @@ using namespace orc_rt;
extern "C" const char *__orc_rt_jit_dlerror();
extern "C" void *__orc_rt_jit_dlopen(const char *path, int mode);
-extern "C" int __orc_rt_jit_dlupdate(void *dso_handle, int mode);
+extern "C" int __orc_rt_jit_dlupdate(void *dso_handle);
extern "C" int __orc_rt_jit_dlclose(void *dso_handle);
ORC_RT_INTERFACE orc_rt_CWrapperFunctionResult
@@ -45,10 +45,10 @@ __orc_rt_jit_dlopen_wrapper(const char *ArgData, size_t ArgSize) {
#ifdef __APPLE__
ORC_RT_INTERFACE orc_rt_CWrapperFunctionResult
__orc_rt_jit_dlupdate_wrapper(const char *ArgData, size_t ArgSize) {
- return WrapperFunction<int32_t(SPSExecutorAddr, int32_t)>::handle(
+ return WrapperFunction<int32_t(SPSExecutorAddr)>::handle(
ArgData, ArgSize,
- [](ExecutorAddr &DSOHandle, int32_t mode) {
- return __orc_rt_jit_dlupdate(DSOHandle.toPtr<void *>(), mode);
+ [](ExecutorAddr &DSOHandle) {
+ return __orc_rt_jit_dlupdate(DSOHandle.toPtr<void *>());
})
.release();
}
diff --git a/compiler-rt/lib/orc/macho_platform.cpp b/compiler-rt/lib/orc/macho_platform.cpp
index afd90c7..8ca6858 100644
--- a/compiler-rt/lib/orc/macho_platform.cpp
+++ b/compiler-rt/lib/orc/macho_platform.cpp
@@ -245,7 +245,7 @@ public:
const char *dlerror();
void *dlopen(std::string_view Name, int Mode);
- int dlupdate(void *DSOHandle, int Mode);
+ int dlupdate(void *DSOHandle);
int dlclose(void *DSOHandle);
void *dlsym(void *DSOHandle, const char *Symbol);
@@ -295,7 +295,7 @@ private:
Error dlopenInitialize(std::unique_lock<std::mutex> &JDStatesLock,
JITDylibState &JDS, MachOJITDylibDepInfoMap &DepInfo);
- Error dlupdateImpl(void *DSOHandle, int Mode);
+ Error dlupdateImpl(void *DSOHandle);
Error dlupdateFull(std::unique_lock<std::mutex> &JDStatesLock,
JITDylibState &JDS);
Error dlupdateInitialize(std::unique_lock<std::mutex> &JDStatesLock,
@@ -710,13 +710,13 @@ void *MachOPlatformRuntimeState::dlopen(std::string_view Path, int Mode) {
}
}
-int MachOPlatformRuntimeState::dlupdate(void *DSOHandle, int Mode) {
+int MachOPlatformRuntimeState::dlupdate(void *DSOHandle) {
ORC_RT_DEBUG({
std::string S;
printdbg("MachOPlatform::dlupdate(%p) (%s)\n", DSOHandle, S.c_str());
});
std::lock_guard<std::recursive_mutex> Lock(DyldAPIMutex);
- if (auto Err = dlupdateImpl(DSOHandle, Mode)) {
+ if (auto Err = dlupdateImpl(DSOHandle)) {
// FIXME: Make dlerror thread safe.
DLFcnError = toString(std::move(Err));
return -1;
@@ -1179,7 +1179,7 @@ Error MachOPlatformRuntimeState::dlopenInitialize(
return Error::success();
}
-Error MachOPlatformRuntimeState::dlupdateImpl(void *DSOHandle, int Mode) {
+Error MachOPlatformRuntimeState::dlupdateImpl(void *DSOHandle) {
std::unique_lock<std::mutex> Lock(JDStatesMutex);
// Try to find JITDylib state by DSOHandle.
@@ -1513,8 +1513,8 @@ void *__orc_rt_macho_jit_dlopen(const char *path, int mode) {
return MachOPlatformRuntimeState::get().dlopen(path, mode);
}
-int __orc_rt_macho_jit_dlupdate(void *dso_handle, int mode) {
- return MachOPlatformRuntimeState::get().dlupdate(dso_handle, mode);
+int __orc_rt_macho_jit_dlupdate(void *dso_handle) {
+ return MachOPlatformRuntimeState::get().dlupdate(dso_handle);
}
int __orc_rt_macho_jit_dlclose(void *dso_handle) {
diff --git a/compiler-rt/lib/orc/macho_platform.h b/compiler-rt/lib/orc/macho_platform.h
index ad70c97..aeab248 100644
--- a/compiler-rt/lib/orc/macho_platform.h
+++ b/compiler-rt/lib/orc/macho_platform.h
@@ -24,7 +24,7 @@ ORC_RT_INTERFACE void __orc_rt_macho_cxa_finalize(void *dso_handle);
// dlfcn functions.
ORC_RT_INTERFACE const char *__orc_rt_macho_jit_dlerror();
ORC_RT_INTERFACE void *__orc_rt_macho_jit_dlopen(const char *path, int mode);
-ORC_RT_INTERFACE int __orc_rt_macho_jit_dlupdate(void *dso_handle, int mode);
+ORC_RT_INTERFACE int __orc_rt_macho_jit_dlupdate(void *dso_handle);
ORC_RT_INTERFACE int __orc_rt_macho_jit_dlclose(void *dso_handle);
ORC_RT_INTERFACE void *__orc_rt_macho_jit_dlsym(void *dso_handle,
const char *symbol);
diff --git a/compiler-rt/lib/rtsan/rtsan_assertions.h b/compiler-rt/lib/rtsan/rtsan_assertions.h
index 8183a820..927b32e 100644
--- a/compiler-rt/lib/rtsan/rtsan_assertions.h
+++ b/compiler-rt/lib/rtsan/rtsan_assertions.h
@@ -28,6 +28,9 @@ void ExpectNotRealtime(Context &context, const DiagnosticsInfo &info,
if (context.InRealtimeContext() && !context.IsBypassed()) {
ScopedBypass sb{context};
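+    // Skip the report entirely if the intercepted function's name matches a
+    // function-name-matches suppression.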
+ if (IsFunctionSuppressed(info.func_name))
+ return;
+
__sanitizer::BufferedStackTrace stack;
// We use the unwind_on_fatal flag here because of precedent with other
diff --git a/compiler-rt/lib/rtsan/rtsan_checks.inc b/compiler-rt/lib/rtsan/rtsan_checks.inc
index f5f23e0..676b6a5 100644
--- a/compiler-rt/lib/rtsan/rtsan_checks.inc
+++ b/compiler-rt/lib/rtsan/rtsan_checks.inc
@@ -17,3 +17,4 @@
// SummaryKind should be a string literal.
RTSAN_CHECK(CallStackContains, "call-stack-contains")
+RTSAN_CHECK(FunctionNameMatches, "function-name-matches")
diff --git a/compiler-rt/lib/rtsan/rtsan_suppressions.cpp b/compiler-rt/lib/rtsan/rtsan_suppressions.cpp
index c5051dd..a7c3d42a 100644
--- a/compiler-rt/lib/rtsan/rtsan_suppressions.cpp
+++ b/compiler-rt/lib/rtsan/rtsan_suppressions.cpp
@@ -92,3 +92,16 @@ bool __rtsan::IsStackTraceSuppressed(const StackTrace &stack) {
}
return false;
}
+
+bool __rtsan::IsFunctionSuppressed(const char *function_name) {
+ if (suppression_ctx == nullptr)
+ return false;
+
+ const char *flag_name = ConvertTypeToFlagName(ErrorType::FunctionNameMatches);
+
+ if (!suppression_ctx->HasSuppressionType(flag_name))
+ return false;
+
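+  // Suppression patterns use the usual wildcard syntax, e.g.
+  // "function-name-matches:Block*".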
+ Suppression *s;
+ return suppression_ctx->Match(function_name, flag_name, &s);
+}
diff --git a/compiler-rt/lib/rtsan/rtsan_suppressions.h b/compiler-rt/lib/rtsan/rtsan_suppressions.h
index 45545f8..9990b99 100644
--- a/compiler-rt/lib/rtsan/rtsan_suppressions.h
+++ b/compiler-rt/lib/rtsan/rtsan_suppressions.h
@@ -18,5 +18,6 @@ namespace __rtsan {
void InitializeSuppressions();
bool IsStackTraceSuppressed(const __sanitizer::StackTrace &stack);
+bool IsFunctionSuppressed(const char *function_name);
} // namespace __rtsan
diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_posix_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_posix_test.cpp
index be577c3..bed19d1 100644
--- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_posix_test.cpp
+++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_posix_test.cpp
@@ -78,6 +78,8 @@ TEST(SanitizerCommon, IsAccessibleMemoryRange) {
EXPECT_TRUE(IsAccessibleMemoryRange(mem + 2 * page_size, page_size));
EXPECT_FALSE(IsAccessibleMemoryRange(mem, 3 * page_size));
EXPECT_FALSE(IsAccessibleMemoryRange(0x0, 2));
+
+ munmap((void *)mem, 3 * page_size);
}
} // namespace __sanitizer
diff --git a/compiler-rt/test/profile/Posix/instrprof-visibility.cpp b/compiler-rt/test/profile/Posix/instrprof-visibility.cpp
index bb53305..016aaed 100644
--- a/compiler-rt/test/profile/Posix/instrprof-visibility.cpp
+++ b/compiler-rt/test/profile/Posix/instrprof-visibility.cpp
@@ -1,3 +1,4 @@
+// XFAIL: target={{.*}}-aix{{.*}}
// RUN: %clangxx_profgen -fcoverage-mapping %S/Inputs/instrprof-visibility-helper.cpp -o %t %s
// RUN: env LLVM_PROFILE_FILE=%t.profraw %run %t
// RUN: llvm-profdata merge %t.profraw -o %t.profdata
diff --git a/compiler-rt/test/profile/coverage-inline.cpp b/compiler-rt/test/profile/coverage-inline.cpp
index e362e56..a411436 100644
--- a/compiler-rt/test/profile/coverage-inline.cpp
+++ b/compiler-rt/test/profile/coverage-inline.cpp
@@ -1,3 +1,4 @@
+// XFAIL: target={{.*}}-aix{{.*}}
// Test that the instrumentation puts the right linkage on the profile data for
// inline functions.
// RUN: %clang_profgen -g -fcoverage-mapping -c -o %t1.o %s -DOBJECT_1
diff --git a/compiler-rt/test/profile/coverage_comments.cpp b/compiler-rt/test/profile/coverage_comments.cpp
index d206fb6..8a99d64 100644
--- a/compiler-rt/test/profile/coverage_comments.cpp
+++ b/compiler-rt/test/profile/coverage_comments.cpp
@@ -1,3 +1,4 @@
+// XFAIL: target={{.*}}-aix{{.*}}
// RUN: %clangxx_profgen -fcoverage-mapping -Wno-comment -o %t %s
// RUN: env LLVM_PROFILE_FILE=%t.profraw %run %t
// RUN: llvm-profdata merge -o %t.profdata %t.profraw
diff --git a/compiler-rt/test/profile/coverage_emptylines.cpp b/compiler-rt/test/profile/coverage_emptylines.cpp
index 8610d70..8006cde 100644
--- a/compiler-rt/test/profile/coverage_emptylines.cpp
+++ b/compiler-rt/test/profile/coverage_emptylines.cpp
@@ -1,3 +1,4 @@
+// XFAIL: target={{.*}}-aix{{.*}}
// Remove comments first.
// RUN: sed 's/[ \t]*\/\/.*//' %s > %t.stripped.cpp
// RUN: %clangxx_profgen -fcoverage-mapping -o %t %t.stripped.cpp
diff --git a/compiler-rt/test/profile/instrprof-merging.cpp b/compiler-rt/test/profile/instrprof-merging.cpp
index 6212feb..4a3f14b 100644
--- a/compiler-rt/test/profile/instrprof-merging.cpp
+++ b/compiler-rt/test/profile/instrprof-merging.cpp
@@ -1,4 +1,5 @@
// UNSUPPORTED: target={{.*windows.*}}
+// XFAIL: target={{.*}}-aix{{.*}}
// 1) Compile shared code into different object files and into an executable.
// RUN: %clangxx_profgen -std=c++14 -fcoverage-mapping %s -c -o %t.v1.o \
diff --git a/compiler-rt/test/profile/instrprof-set-file-object-merging.c b/compiler-rt/test/profile/instrprof-set-file-object-merging.c
index 92f5f92..baabb21 100644
--- a/compiler-rt/test/profile/instrprof-set-file-object-merging.c
+++ b/compiler-rt/test/profile/instrprof-set-file-object-merging.c
@@ -24,6 +24,7 @@ int main(int argc, const char *argv[]) {
return 0;
}
+// XFAIL: target={{.*}}-aix{{.*}}
// CHECK: 10| |#include <stdio.h>
// CHECK: 11| |
// CHECK: 12| |extern void __llvm_profile_set_file_object(FILE *, int);
diff --git a/compiler-rt/test/profile/instrprof-set-file-object.c b/compiler-rt/test/profile/instrprof-set-file-object.c
index 280374a..0d1f96d 100644
--- a/compiler-rt/test/profile/instrprof-set-file-object.c
+++ b/compiler-rt/test/profile/instrprof-set-file-object.c
@@ -17,6 +17,7 @@ int main(int argc, const char *argv[]) {
__llvm_profile_set_file_object(F, 0);
return 0;
}
+// XFAIL: target={{.*}}-aix{{.*}}
// CHECK: 8| |#include <stdio.h>
// CHECK: 9| |
// CHECK: 10| |extern void __llvm_profile_set_file_object(FILE *, int);
diff --git a/compiler-rt/test/profile/instrprof-without-libc.c b/compiler-rt/test/profile/instrprof-without-libc.c
index 3142138..d0d213b 100644
--- a/compiler-rt/test/profile/instrprof-without-libc.c
+++ b/compiler-rt/test/profile/instrprof-without-libc.c
@@ -1,3 +1,4 @@
+// XFAIL: target={{.*}}-aix{{.*}}
// RUN: %clang_profgen -DCHECK_SYMBOLS -O3 -o %t.symbols %s
// RUN: llvm-nm %t.symbols | FileCheck %s --check-prefix=CHECK-SYMBOLS
// RUN: %clang_profgen -O3 -o %t %s
diff --git a/compiler-rt/test/profile/instrprof-write-file-only.c b/compiler-rt/test/profile/instrprof-write-file-only.c
index f505cf6..5edad27 100644
--- a/compiler-rt/test/profile/instrprof-write-file-only.c
+++ b/compiler-rt/test/profile/instrprof-write-file-only.c
@@ -1,3 +1,4 @@
+// XFAIL: target={{.*}}-aix{{.*}}
// RUN: %clang_profgen -o %t -O3 %s
// RUN: env LLVM_PROFILE_FILE=%t.profraw %run %t
// RUN: llvm-profdata merge -o %t.profdata %t.profraw
diff --git a/compiler-rt/test/profile/lit.cfg.py b/compiler-rt/test/profile/lit.cfg.py
index 3b3019a..c8c78a7 100644
--- a/compiler-rt/test/profile/lit.cfg.py
+++ b/compiler-rt/test/profile/lit.cfg.py
@@ -77,12 +77,8 @@ def exclude_unsupported_files_for_aix(dirname):
f = open(source_path, "r")
try:
data = f.read()
- # -fprofile-instr-generate and rpath are not supported on AIX, exclude all tests with them.
- if (
- "%clang_profgen" in data
- or "%clangxx_profgen" in data
- or "-rpath" in data
- ):
+            # rpath is not supported on AIX; exclude all tests that use it.
+            if "-rpath" in data:
config.excludes += [filename]
finally:
f.close()
diff --git a/compiler-rt/test/rtsan/stack_suppressions.cpp b/compiler-rt/test/rtsan/stack_suppressions.cpp
index 2aceedb..b9b2d09 100644
--- a/compiler-rt/test/rtsan/stack_suppressions.cpp
+++ b/compiler-rt/test/rtsan/stack_suppressions.cpp
@@ -1,4 +1,5 @@
// RUN: %clangxx -fsanitize=realtime %s -o %t
+// RUN: %env_rtsan_opts=halt_on_error=false %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-NOSUPPRESSIONS
// RUN: %env_rtsan_opts=suppressions='%s.supp' not %run %t 2>&1 | FileCheck %s
// UNSUPPORTED: ios
@@ -8,8 +9,11 @@
#include <stdlib.h>
#include <unistd.h>
+#include <atomic>
#include <vector>
+std::atomic<int> cas_atomic{0};
+
void *MallocViolation() { return malloc(10); }
void VectorViolations() {
@@ -22,13 +26,18 @@ void VectorViolations() {
v.reserve(10);
}
-void BlockFunc() [[clang::blocking]] { usleep(1); }
+void BlockFunc() [[clang::blocking]] {
+ int expected = 0;
+ while (!cas_atomic.compare_exchange_weak(expected, 1)) {
+ expected = cas_atomic.load();
+ }
+}
void *process() [[clang::nonblocking]] {
- void *ptr = MallocViolation();
- VectorViolations();
- BlockFunc();
- free(ptr);
+ void *ptr = MallocViolation(); // Suppressed call-stack-contains
+ VectorViolations(); // Suppressed call-stack-contains with regex
+ BlockFunc(); // Suppressed function-name-matches
+ free(ptr); // Suppressed function-name-matches
// This is the one that should abort the program
// Everything else is suppressed
@@ -51,3 +60,9 @@ int main() {
// CHECK-NOT: vector
// CHECK-NOT: free
// CHECK-NOT: BlockFunc
+
+// CHECK-NOSUPPRESSIONS: malloc
+// CHECK-NOSUPPRESSIONS: vector
+// CHECK-NOSUPPRESSIONS: free
+// CHECK-NOSUPPRESSIONS: BlockFunc
+// CHECK-NOSUPPRESSIONS: usleep
diff --git a/compiler-rt/test/rtsan/stack_suppressions.cpp.supp b/compiler-rt/test/rtsan/stack_suppressions.cpp.supp
index bec4db2..9aaa5a5 100644
--- a/compiler-rt/test/rtsan/stack_suppressions.cpp.supp
+++ b/compiler-rt/test/rtsan/stack_suppressions.cpp.supp
@@ -1,4 +1,5 @@
call-stack-contains:MallocViolation
call-stack-contains:std::*vector
-call-stack-contains:free
-call-stack-contains:BlockFunc
+
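+# function-name-matches checks only the name of the function that raised the
+# violation, rather than searching the entire call stack.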
+function-name-matches:free
+function-name-matches:Block*
diff --git a/flang/lib/Evaluate/intrinsics-library.cpp b/flang/lib/Evaluate/intrinsics-library.cpp
index ee4df2d..bb439a6 100644
--- a/flang/lib/Evaluate/intrinsics-library.cpp
+++ b/flang/lib/Evaluate/intrinsics-library.cpp
@@ -417,7 +417,7 @@ template <> struct HostRuntimeLibrary<double, LibraryVersion::LibmExtensions> {
static_assert(map.Verify(), "map must be sorted");
};
-#if HAS_FLOAT80 || HAS_LDBL128
+#if defined(__GLIBC__) && (HAS_FLOAT80 || HAS_LDBL128)
template <>
struct HostRuntimeLibrary<long double, LibraryVersion::LibmExtensions> {
using F = FuncPointer<long double, long double>;
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 70d89f5..cf46900 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -2070,7 +2070,9 @@ static void genStandaloneSimd(lower::AbstractConverter &converter,
loopNestClauseOps, iv);
EntryBlockArgs simdArgs;
- // TODO: Add private, reduction syms and vars.
+ // TODO: Add private syms and vars.
+ simdArgs.reduction.syms = simdReductionSyms;
+ simdArgs.reduction.vars = simdClauseOps.reductionVars;
auto simdOp =
genWrapperOp<mlir::omp::SimdOp>(converter, loc, simdClauseOps, simdArgs);
@@ -2228,7 +2230,9 @@ static void genCompositeDistributeParallelDoSimd(
wsloopOp.setComposite(/*val=*/true);
EntryBlockArgs simdArgs;
- // TODO: Add private, reduction syms and vars.
+ // TODO: Add private syms and vars.
+ simdArgs.reduction.syms = simdReductionSyms;
+ simdArgs.reduction.vars = simdClauseOps.reductionVars;
auto simdOp =
genWrapperOp<mlir::omp::SimdOp>(converter, loc, simdClauseOps, simdArgs);
simdOp.setComposite(/*val=*/true);
@@ -2285,7 +2289,9 @@ static void genCompositeDistributeSimd(lower::AbstractConverter &converter,
distributeOp.setComposite(/*val=*/true);
EntryBlockArgs simdArgs;
- // TODO: Add private, reduction syms and vars.
+ // TODO: Add private syms and vars.
+ simdArgs.reduction.syms = simdReductionSyms;
+ simdArgs.reduction.vars = simdClauseOps.reductionVars;
auto simdOp =
genWrapperOp<mlir::omp::SimdOp>(converter, loc, simdClauseOps, simdArgs);
simdOp.setComposite(/*val=*/true);
@@ -2342,7 +2348,9 @@ static void genCompositeDoSimd(lower::AbstractConverter &converter,
wsloopOp.setComposite(/*val=*/true);
EntryBlockArgs simdArgs;
- // TODO: Add private, reduction syms and vars.
+ // TODO: Add private syms and vars.
+ simdArgs.reduction.syms = simdReductionSyms;
+ simdArgs.reduction.vars = simdClauseOps.reductionVars;
auto simdOp =
genWrapperOp<mlir::omp::SimdOp>(converter, loc, simdClauseOps, simdArgs);
simdOp.setComposite(/*val=*/true);
diff --git a/flang/runtime/Float128Math/math-entries.h b/flang/runtime/Float128Math/math-entries.h
index 90a983b..4600c72 100644
--- a/flang/runtime/Float128Math/math-entries.h
+++ b/flang/runtime/Float128Math/math-entries.h
@@ -187,9 +187,6 @@ DEFINE_SIMPLE_ALIAS(Hypot, std::hypot)
DEFINE_SIMPLE_ALIAS(Ilogb, std::ilogb)
DEFINE_SIMPLE_ALIAS(Isinf, std::isinf)
DEFINE_SIMPLE_ALIAS(Isnan, std::isnan)
-DEFINE_SIMPLE_ALIAS(J0, j0l)
-DEFINE_SIMPLE_ALIAS(J1, j1l)
-DEFINE_SIMPLE_ALIAS(Jn, jnl)
DEFINE_SIMPLE_ALIAS(Ldexp, std::ldexp)
DEFINE_SIMPLE_ALIAS(Lgamma, std::lgamma)
DEFINE_SIMPLE_ALIAS(Llround, std::llround)
@@ -207,9 +204,15 @@ DEFINE_SIMPLE_ALIAS(Tan, std::tan)
DEFINE_SIMPLE_ALIAS(Tanh, std::tanh)
DEFINE_SIMPLE_ALIAS(Tgamma, std::tgamma)
DEFINE_SIMPLE_ALIAS(Trunc, std::trunc)
+
+#if defined(__GLIBC__) && defined(_GNU_SOURCE)
+DEFINE_SIMPLE_ALIAS(J0, j0l)
+DEFINE_SIMPLE_ALIAS(J1, j1l)
+DEFINE_SIMPLE_ALIAS(Jn, jnl)
DEFINE_SIMPLE_ALIAS(Y0, y0l)
DEFINE_SIMPLE_ALIAS(Y1, y1l)
DEFINE_SIMPLE_ALIAS(Yn, ynl)
+#endif
// Use numeric_limits to produce infinity of the right type.
#define F128_RT_INFINITY \
diff --git a/flang/test/Driver/atomic.f90 b/flang/test/Driver/atomic.f90
new file mode 100644
index 0000000..0fb3b42
--- /dev/null
+++ b/flang/test/Driver/atomic.f90
@@ -0,0 +1,5 @@
+!RUN: %flang --target=aarch64-unknown-linux-gnu -fuse-ld=ld -fopenmp -rtlib=libgcc -### %s 2>&1 | FileCheck --check-prefixes=GCC %s
+!RUN: %flang --target=aarch64-unknown-linux-gnu -fuse-ld=ld -fopenmp -rtlib=compiler-rt -### %s 2>&1 | FileCheck --check-prefixes=CRT %s
+
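+!Only the libgcc runtime is expected to need an explicit -latomic: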
+!GCC: -latomic
+!CRT-NOT: -latomic
diff --git a/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90 b/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90
new file mode 100644
index 0000000..3aa5d04
--- /dev/null
+++ b/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90
@@ -0,0 +1,262 @@
+! RUN: %flang_fc1 -fopenmp -emit-llvm %s -o - | FileCheck %s
+
+! Combinational testing of control flow graph and builder insertion points
+! in mlir-to-llvm conversion:
+! - mixing multiple delayed privatizations and multiple reductions
+! - multiple blocks in the private alloc region
+! - private alloc region has to read from the mold variable
+! - firstprivate
+! - multiple blocks in the private copy region
+! - multiple blocks in the reduction init region
+! - reduction init region has to read from the mold variable
+! - re-used omp.private ops
+! - re-used omp.reduction.declare ops
+! - unstructured code inside of the parallel region
+! - needs private dealloc region, and this has multiple blocks
+! - needs reduction cleanup region, and this has multiple blocks
+
+! This perhaps belongs in the MLIR tests, but what we are doing here is
+! complex enough that I find the kind of minimised MLIR code preferred by MLIR
+! reviewers hard to read without some Fortran here for reference. Nothing like
+! this would be generated by other upstream users of the MLIR OpenMP dialect.
+
+subroutine worst_case(a, b, c, d)
+ real, allocatable :: a(:), b(:), c(:), d(:)
+ integer i
+
+ !$omp parallel firstprivate(a,b) reduction(+:c,d)
+ if (sum(a) == 1) stop 1
+ !$omp end parallel
+end subroutine
+
+! CHECK-LABEL: define internal void @worst_case_..omp_par
+! CHECK-NEXT: omp.par.entry:
+! [reduction alloc regions inlined here]
+! CHECK: br label %omp.private.latealloc
+
+! CHECK: omp.private.latealloc: ; preds = %omp.par.entry
+! CHECK-NEXT: br label %omp.private.alloc5
+
+! CHECK: omp.private.alloc5: ; preds = %omp.private.latealloc
+! [begin private alloc for first var]
+! [read the length from the mold argument]
+! [if it is non-zero...]
+! CHECK: br i1 {{.*}}, label %omp.private.alloc6, label %omp.private.alloc7
+
+! CHECK: omp.private.alloc7: ; preds = %omp.private.alloc5
+! [finish private alloc for first var with zero extent]
+! CHECK: br label %omp.private.alloc8
+
+! CHECK: omp.private.alloc8: ; preds = %omp.private.alloc6, %omp.private.alloc7
+! CHECK-NEXT: br label %omp.region.cont4
+
+! CHECK: omp.region.cont4: ; preds = %omp.private.alloc8
+! CHECK-NEXT: %{{.*}} = phi ptr
+! CHECK-NEXT: br label %omp.private.alloc
+
+! CHECK: omp.private.alloc: ; preds = %omp.region.cont4
+! [begin private alloc for first var]
+! [read the length from the mold argument]
+! [if it is non-zero...]
+! CHECK: br i1 %{{.*}}, label %omp.private.alloc1, label %omp.private.alloc2
+
+! CHECK: omp.private.alloc2: ; preds = %omp.private.alloc
+! [finish private alloc for second var with zero extent]
+! CHECK: br label %omp.private.alloc3
+
+! CHECK: omp.private.alloc3: ; preds = %omp.private.alloc1, %omp.private.alloc2
+! CHECK-NEXT: br label %omp.region.cont
+
+! CHECK: omp.region.cont: ; preds = %omp.private.alloc3
+! CHECK-NEXT: %{{.*}} = phi ptr
+! CHECK-NEXT: br label %omp.private.copy
+
+! CHECK: omp.private.copy: ; preds = %omp.region.cont
+! CHECK-NEXT: br label %omp.private.copy10
+
+! CHECK: omp.private.copy10: ; preds = %omp.private.copy
+! [begin firstprivate copy for first var]
+! [read the length, is it non-zero?]
+! CHECK: br i1 %{{.*}}, label %omp.private.copy11, label %omp.private.copy12
+
+! CHECK: omp.private.copy12: ; preds = %omp.private.copy11, %omp.private.copy10
+! CHECK-NEXT: br label %omp.region.cont9
+
+! CHECK: omp.region.cont9: ; preds = %omp.private.copy12
+! CHECK-NEXT: %{{.*}} = phi ptr
+! CHECK-NEXT: br label %omp.private.copy14
+
+! CHECK: omp.private.copy14: ; preds = %omp.region.cont9
+! [begin firstprivate copy for second var]
+! [read the length, is it non-zero?]
+! CHECK: br i1 %{{.*}}, label %omp.private.copy15, label %omp.private.copy16
+
+! CHECK: omp.private.copy16: ; preds = %omp.private.copy15, %omp.private.copy14
+! CHECK-NEXT: br label %omp.region.cont13
+
+! CHECK: omp.region.cont13: ; preds = %omp.private.copy16
+! CHECK-NEXT: %{{.*}} = phi ptr
+! CHECK-NEXT: br label %omp.reduction.init
+
+! CHECK: omp.reduction.init: ; preds = %omp.region.cont13
+! [deferred stores for results of reduction alloc regions]
+! CHECK: br label %[[VAL_96:.*]]
+
+! CHECK: omp.reduction.neutral: ; preds = %omp.reduction.init
+! [start of reduction initialization region]
+! [null check:]
+! CHECK: br i1 %{{.*}}, label %omp.reduction.neutral18, label %omp.reduction.neutral19
+
+! CHECK: omp.reduction.neutral19: ; preds = %omp.reduction.neutral
+! [malloc and assign the default value to the reduction variable]
+! CHECK: br label %omp.reduction.neutral20
+
+! CHECK: omp.reduction.neutral20: ; preds = %omp.reduction.neutral18, %omp.reduction.neutral19
+! CHECK-NEXT: br label %omp.region.cont17
+
+! CHECK: omp.region.cont17: ; preds = %omp.reduction.neutral20
+! CHECK-NEXT: %{{.*}} = phi ptr
+! CHECK-NEXT: br label %omp.reduction.neutral22
+
+! CHECK: omp.reduction.neutral22: ; preds = %omp.region.cont17
+! [start of reduction initialization region]
+! [null check:]
+! CHECK: br i1 %{{.*}}, label %omp.reduction.neutral23, label %omp.reduction.neutral24
+
+! CHECK: omp.reduction.neutral24: ; preds = %omp.reduction.neutral22
+! [malloc and assign the default value to the reduction variable]
+! CHECK: br label %omp.reduction.neutral25
+
+! CHECK: omp.reduction.neutral25: ; preds = %omp.reduction.neutral23, %omp.reduction.neutral24
+! CHECK-NEXT: br label %omp.region.cont21
+
+! CHECK: omp.region.cont21: ; preds = %omp.reduction.neutral25
+! CHECK-NEXT: %{{.*}} = phi ptr
+! CHECK-NEXT: br label %omp.par.region
+
+! CHECK: omp.par.region: ; preds = %omp.region.cont21
+! CHECK-NEXT: br label %omp.par.region27
+
+! CHECK: omp.par.region27: ; preds = %omp.par.region
+! [call SUM runtime function]
+! [if (sum(a) == 1)]
+! CHECK: br i1 %{{.*}}, label %omp.par.region28, label %omp.par.region29
+
+! CHECK: omp.par.region29: ; preds = %omp.par.region27
+! CHECK-NEXT: br label %omp.region.cont26
+
+! CHECK: omp.region.cont26: ; preds = %omp.par.region28, %omp.par.region29
+! [omp parallel region done, call into the runtime to complete reduction]
+! CHECK: %[[VAL_233:.*]] = call i32 @__kmpc_reduce(
+! CHECK: switch i32 %[[VAL_233]], label %reduce.finalize [
+! CHECK-NEXT: i32 1, label %reduce.switch.nonatomic
+! CHECK-NEXT: i32 2, label %reduce.switch.atomic
+! CHECK-NEXT: ]
+
+! CHECK: reduce.switch.atomic: ; preds = %omp.region.cont26
+! CHECK-NEXT: unreachable
+
+! CHECK: reduce.switch.nonatomic: ; preds = %omp.region.cont26
+! CHECK-NEXT: %[[red_private_value_0:.*]] = load ptr, ptr %{{.*}}, align 8
+! CHECK-NEXT: br label %omp.reduction.nonatomic.body
+
+! [various blocks implementing the reduction]
+
+! CHECK: omp.region.cont35: ; preds =
+! CHECK-NEXT: %{{.*}} = phi ptr
+! CHECK-NEXT: call void @__kmpc_end_reduce(
+! CHECK-NEXT: br label %reduce.finalize
+
+! CHECK: reduce.finalize: ; preds =
+! CHECK-NEXT: br label %omp.par.pre_finalize
+
+! CHECK: omp.par.pre_finalize: ; preds = %reduce.finalize
+! CHECK-NEXT: %{{.*}} = load ptr, ptr
+! CHECK-NEXT: br label %omp.reduction.cleanup
+
+! CHECK: omp.reduction.cleanup: ; preds = %omp.par.pre_finalize
+! [null check]
+! CHECK: br i1 %{{.*}}, label %omp.reduction.cleanup41, label %omp.reduction.cleanup42
+
+! CHECK: omp.reduction.cleanup42: ; preds = %omp.reduction.cleanup41, %omp.reduction.cleanup
+! CHECK-NEXT: br label %omp.region.cont40
+
+! CHECK: omp.region.cont40: ; preds = %omp.reduction.cleanup42
+! CHECK-NEXT: %{{.*}} = load ptr, ptr
+! CHECK-NEXT: br label %omp.reduction.cleanup44
+
+! CHECK: omp.reduction.cleanup44: ; preds = %omp.region.cont40
+! [null check]
+! CHECK: br i1 %{{.*}}, label %omp.reduction.cleanup45, label %omp.reduction.cleanup46
+
+! CHECK: omp.reduction.cleanup46: ; preds = %omp.reduction.cleanup45, %omp.reduction.cleanup44
+! CHECK-NEXT: br label %omp.region.cont43
+
+! CHECK: omp.region.cont43: ; preds = %omp.reduction.cleanup46
+! CHECK-NEXT: br label %omp.private.dealloc
+
+! CHECK: omp.private.dealloc: ; preds = %omp.region.cont43
+! [null check]
+! CHECK: br i1 %{{.*}}, label %omp.private.dealloc48, label %omp.private.dealloc49
+
+! CHECK: omp.private.dealloc49: ; preds = %omp.private.dealloc48, %omp.private.dealloc
+! CHECK-NEXT: br label %omp.region.cont47
+
+! CHECK: omp.region.cont47: ; preds = %omp.private.dealloc49
+! CHECK-NEXT: br label %omp.private.dealloc51
+
+! CHECK: omp.private.dealloc51: ; preds = %omp.region.cont47
+! [null check]
+! CHECK: br i1 %{{.*}}, label %omp.private.dealloc52, label %omp.private.dealloc53
+
+! CHECK: omp.private.dealloc53: ; preds = %omp.private.dealloc52, %omp.private.dealloc51
+! CHECK-NEXT: br label %omp.region.cont50
+
+! CHECK: omp.region.cont50: ; preds = %omp.private.dealloc53
+! CHECK-NEXT: br label %omp.par.outlined.exit.exitStub
+
+! CHECK: omp.private.dealloc52: ; preds = %omp.private.dealloc51
+! [dealloc memory]
+! CHECK: br label %omp.private.dealloc53
+
+! CHECK: omp.private.dealloc48: ; preds = %omp.private.dealloc
+! [dealloc memory]
+! CHECK: br label %omp.private.dealloc49
+
+! CHECK: omp.reduction.cleanup45: ; preds = %omp.reduction.cleanup44
+! CHECK-NEXT: call void @free(
+! CHECK-NEXT: br label %omp.reduction.cleanup46
+
+! CHECK: omp.reduction.cleanup41: ; preds = %omp.reduction.cleanup
+! CHECK-NEXT: call void @free(
+! CHECK-NEXT: br label %omp.reduction.cleanup42
+
+! CHECK: omp.par.region28: ; preds = %omp.par.region27
+! CHECK-NEXT: call {} @_FortranAStopStatement
+
+! CHECK: omp.reduction.neutral23: ; preds = %omp.reduction.neutral22
+! [source length was zero: finish initializing array]
+! CHECK: br label %omp.reduction.neutral25
+
+! CHECK: omp.reduction.neutral18: ; preds = %omp.reduction.neutral
+! [source length was zero: finish initializing array]
+! CHECK: br label %omp.reduction.neutral20
+
+! CHECK: omp.private.copy15: ; preds = %omp.private.copy14
+! [source length was non-zero: call assign runtime]
+! CHECK: br label %omp.private.copy16
+
+! CHECK: omp.private.copy11: ; preds = %omp.private.copy10
+! [source length was non-zero: call assign runtime]
+! CHECK: br label %omp.private.copy12
+
+! CHECK: omp.private.alloc1: ; preds = %omp.private.alloc
+! [var extent was non-zero: malloc a private array]
+! CHECK: br label %omp.private.alloc3
+
+! CHECK: omp.private.alloc6: ; preds = %omp.private.alloc5
+! [var extent was non-zero: malloc a private array]
+! CHECK: br label %omp.private.alloc8
+
+! CHECK: omp.par.outlined.exit.exitStub: ; preds = %omp.region.cont50
+! CHECK-NEXT: ret void
diff --git a/flang/test/Integration/OpenMP/private-global.f90 b/flang/test/Integration/OpenMP/private-global.f90
new file mode 100644
index 0000000..62d0a3f
--- /dev/null
+++ b/flang/test/Integration/OpenMP/private-global.f90
@@ -0,0 +1,46 @@
+!RUN: %flang_fc1 -emit-llvm -fopenmp %s -o - | FileCheck %s
+
+! Regression test for https://github.com/llvm/llvm-project/issues/106297
+
+program bug
+ implicit none
+ integer :: table(10)
+ !$OMP PARALLEL PRIVATE(table)
+ table = 50
+ if (any(table/=50)) then
+ stop 'fail 3'
+ end if
+ !$OMP END PARALLEL
+ print *,'ok'
+End Program
+
+
+! CHECK-LABEL: define internal void {{.*}}..omp_par(
+! CHECK: omp.par.entry:
+! CHECK: %[[VAL_9:.*]] = alloca i32, align 4
+! CHECK: %[[VAL_10:.*]] = load i32, ptr %[[VAL_11:.*]], align 4
+! CHECK: store i32 %[[VAL_10]], ptr %[[VAL_9]], align 4
+! CHECK: %[[VAL_12:.*]] = load i32, ptr %[[VAL_9]], align 4
+! CHECK: %[[PRIV_TABLE:.*]] = alloca [10 x i32], i64 1, align 4
+! ...
+! check that we use the private copy of table for the assignment
+! CHECK: omp.par.region1:
+! CHECK: %[[ELEMENTAL_TMP:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8
+! CHECK: %[[TABLE_BOX_ADDR:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8
+! CHECK: %[[BOXED_FIFTY:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8
+! CHECK: %[[TABLE_BOX_ADDR2:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8
+! CHECK: %[[TABLE_BOX_VAL:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } { ptr undef, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20240719, i8 1, i8 9, i8 0, i8 0, [1 x [3 x i64]] {{\[\[}}3 x i64] [i64 1, i64 10, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64)]] }, ptr %[[PRIV_TABLE]], 0
+! CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[TABLE_BOX_VAL]], ptr %[[TABLE_BOX_ADDR]], align 8
+! CHECK: %[[TABLE_BOX_VAL2:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[TABLE_BOX_ADDR]], align 8
+! CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[TABLE_BOX_VAL2]], ptr %[[TABLE_BOX_ADDR2]], align 8
+! CHECK: %[[VAL_26:.*]] = call {} @_FortranAAssign(ptr %[[TABLE_BOX_ADDR2]], ptr %[[BOXED_FIFTY]], ptr @{{.*}}, i32 9)
+! ...
+! check that we use the private copy of table for table/=50
+! CHECK: omp.par.region3:
+! CHECK: %[[VAL_44:.*]] = sub nsw i64 %{{.*}}, 1
+! CHECK: %[[VAL_45:.*]] = mul nsw i64 %[[VAL_44]], 1
+! CHECK: %[[VAL_46:.*]] = mul nsw i64 %[[VAL_45]], 1
+! CHECK: %[[VAL_47:.*]] = add nsw i64 %[[VAL_46]], 0
+! CHECK: %[[VAL_48:.*]] = getelementptr i32, ptr %[[PRIV_TABLE]], i64 %[[VAL_47]]
+! CHECK: %[[VAL_49:.*]] = load i32, ptr %[[VAL_48]], align 4
+! CHECK: %[[VAL_50:.*]] = icmp ne i32 %[[VAL_49]], 50
diff --git a/flang/test/Lower/OpenMP/simd.f90 b/flang/test/Lower/OpenMP/simd.f90
index f574a12..d92f06c 100644
--- a/flang/test/Lower/OpenMP/simd.f90
+++ b/flang/test/Lower/OpenMP/simd.f90
@@ -4,6 +4,8 @@
! RUN: %flang_fc1 -flang-experimental-hlfir -emit-hlfir -fopenmp -fopenmp-version=50 %s -o - | FileCheck %s
! RUN: bbc -hlfir -emit-hlfir -fopenmp -fopenmp-version=50 %s -o - | FileCheck %s
+!CHECK: omp.declare_reduction @[[REDUCER:.*]] : i32
+
!CHECK-LABEL: func @_QPsimd()
subroutine simd
integer :: i
@@ -273,3 +275,25 @@ subroutine lastprivate_with_simd
sum = i + 1
end do
end subroutine
+
+!CHECK-LABEL: func @_QPsimd_with_reduction_clause()
+subroutine simd_with_reduction_clause
+ integer :: i, x
+ x = 0
+ ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
+ ! CHECK-NEXT: %[[UB:.*]] = arith.constant 9 : i32
+ ! CHECK-NEXT: %[[STEP:.*]] = arith.constant 1 : i32
+ ! CHECK-NEXT: omp.simd reduction(@[[REDUCER]] %[[X:.*]]#0 -> %[[X_RED:.*]] : !fir.ref<i32>) {
+ ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
+ !$omp simd reduction(+:x)
+ do i=1, 9
+ ! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X_RED]] {uniq_name = "_QFsimd_with_reduction_clauseEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref<i32>
+ ! CHECK: %[[X_LD:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<i32>
+ ! CHECK: %[[I_LD:.*]] = fir.load %[[LOCAL]]#0 : !fir.ref<i32>
+ ! CHECK: %[[SUM:.*]] = arith.addi %[[X_LD]], %[[I_LD]] : i32
+ ! CHECK: hlfir.assign %[[SUM]] to %[[X_DECL]]#0 : i32, !fir.ref<i32>
+ x = x+i
+ end do
+ !$OMP end simd
+end subroutine
diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt
index b4cfe47..251ad43 100644
--- a/libc/config/gpu/entrypoints.txt
+++ b/libc/config/gpu/entrypoints.txt
@@ -522,6 +522,7 @@ if(LIBC_TYPES_HAS_FLOAT16)
libc.src.math.ceilf16
libc.src.math.copysignf16
libc.src.math.exp10f16
+ libc.src.math.exp10m1f16
libc.src.math.exp2f16
libc.src.math.expf16
libc.src.math.f16add
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index 2589da3..3ca14ec 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -611,6 +611,7 @@ if(LIBC_TYPES_HAS_FLOAT16)
libc.src.math.ceilf16
libc.src.math.copysignf16
libc.src.math.exp10f16
+ libc.src.math.exp10m1f16
libc.src.math.exp2f16
libc.src.math.exp2m1f16
libc.src.math.expf16
diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst
index 72e8f66..95ac7f4 100644
--- a/libc/docs/math/index.rst
+++ b/libc/docs/math/index.rst
@@ -292,7 +292,7 @@ Higher Math Functions
+-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
| exp10 | |check| | |check| | | |check| | | 7.12.6.2 | F.10.3.2 |
+-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| exp10m1 | | | | | | 7.12.6.3 | F.10.3.3 |
+| exp10m1 | | | | |check| | | 7.12.6.3 | F.10.3.3 |
+-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
| exp2 | |check| | |check| | | |check| | | 7.12.6.4 | F.10.3.4 |
+-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td
index 1b25569..ea032ba 100644
--- a/libc/spec/stdc.td
+++ b/libc/spec/stdc.td
@@ -692,6 +692,8 @@ def StdC : StandardSpec<"stdc"> {
FunctionSpec<"exp10f", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
GuardedFunctionSpec<"exp10f16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">,
+ GuardedFunctionSpec<"exp10m1f16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">,
+
FunctionSpec<"remainder", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
FunctionSpec<"remainderf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>,
FunctionSpec<"remainderl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>,
diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
index 7803369..ecf6396 100644
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -127,6 +127,8 @@ add_math_entrypoint_object(exp10)
add_math_entrypoint_object(exp10f)
add_math_entrypoint_object(exp10f16)
+add_math_entrypoint_object(exp10m1f16)
+
add_math_entrypoint_object(expm1)
add_math_entrypoint_object(expm1f)
add_math_entrypoint_object(expm1f16)
diff --git a/libc/src/math/exp10m1f16.h b/libc/src/math/exp10m1f16.h
new file mode 100644
index 0000000..e195bc4
--- /dev/null
+++ b/libc/src/math/exp10m1f16.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for exp10m1f16 --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_EXP10M1F16_H
+#define LLVM_LIBC_SRC_MATH_EXP10M1F16_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+float16 exp10m1f16(float16 x);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_EXP10M1F16_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 1ad611f..ffa7497 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -1657,6 +1657,29 @@ add_entrypoint_object(
)
add_entrypoint_object(
+ exp10m1f16
+ SRCS
+ exp10m1f16.cpp
+ HDRS
+ ../exp10m1f16.h
+ DEPENDS
+ .expxf16
+ libc.hdr.errno_macros
+ libc.hdr.fenv_macros
+ libc.src.__support.FPUtil.cast
+ libc.src.__support.FPUtil.except_value_utils
+ libc.src.__support.FPUtil.fenv_impl
+ libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.FPUtil.multiply_add
+ libc.src.__support.FPUtil.polyeval
+ libc.src.__support.FPUtil.rounding_mode
+ libc.src.__support.macros.optimization
+ libc.src.__support.macros.properties.cpu_features
+ COMPILE_OPTIONS
+ -O3
+)
+
+add_entrypoint_object(
expm1
SRCS
expm1.cpp
diff --git a/libc/src/math/generic/exp10f16.cpp b/libc/src/math/generic/exp10f16.cpp
index 1c5966c..f7a8ee3 100644
--- a/libc/src/math/generic/exp10f16.cpp
+++ b/libc/src/math/generic/exp10f16.cpp
@@ -54,16 +54,6 @@ static constexpr fputil::ExceptValues<float16, N_EXP10F16_EXCEPTS>
#endif
}};
-// Generated by Sollya with the following commands:
-// > display = hexadecimal;
-// > round(log2(10), SG, RN);
-static constexpr float LOG2F_10 = 0x1.a934fp+1f;
-
-// Generated by Sollya with the following commands:
-// > display = hexadecimal;
-// > round(log10(2), SG, RN);
-static constexpr float LOG10F_2 = 0x1.344136p-2f;
-
LLVM_LIBC_FUNCTION(float16, exp10f16, (float16 x)) {
using FPBits = fputil::FPBits<float16>;
FPBits x_bits(x);
@@ -132,40 +122,9 @@ LLVM_LIBC_FUNCTION(float16, exp10f16, (float16 x)) {
if (auto r = EXP10F16_EXCEPTS.lookup(x_u); LIBC_UNLIKELY(r.has_value()))
return r.value();
- // For -8 < x < 5, to compute 10^x, we perform the following range reduction:
- // find hi, mid, lo, such that:
- // x = (hi + mid) * log2(10) + lo, in which
- // hi is an integer,
- // mid * 2^3 is an integer,
- // -2^(-4) <= lo < 2^(-4).
- // In particular,
- // hi + mid = round(x * 2^3) * 2^(-3).
- // Then,
- // 10^x = 10^(hi + mid + lo) = 2^((hi + mid) * log2(10)) + 10^lo
- // We store 2^mid in the lookup table EXP2_MID_BITS, and compute 2^hi * 2^mid
- // by adding hi to the exponent field of 2^mid. 10^lo is computed using a
- // degree-4 minimax polynomial generated by Sollya.
-
- float xf = x;
- float kf = fputil::nearest_integer(xf * (LOG2F_10 * 0x1.0p+3f));
- int x_hi_mid = static_cast<int>(kf);
- int x_hi = x_hi_mid >> 3;
- int x_mid = x_hi_mid & 0x7;
- // lo = x - (hi + mid) = round(x * 2^3 * log2(10)) * log10(2) * (-2^(-3)) + x
- float lo = fputil::multiply_add(kf, LOG10F_2 * -0x1.0p-3f, xf);
-
- uint32_t exp2_hi_mid_bits =
- EXP2_MID_BITS[x_mid] +
- static_cast<uint32_t>(x_hi << fputil::FPBits<float>::FRACTION_LEN);
- float exp2_hi_mid = fputil::FPBits<float>(exp2_hi_mid_bits).get_val();
- // Degree-4 minimax polynomial generated by Sollya with the following
- // commands:
- // > display = hexadecimal;
- // > P = fpminimax((10^x - 1)/x, 3, [|SG...|], [-2^-4, 2^-4]);
- // > 1 + x * P;
- float exp10_lo = fputil::polyeval(lo, 0x1p+0f, 0x1.26bb14p+1f, 0x1.53526p+1f,
- 0x1.04b434p+1f, 0x1.2bcf9ep+0f);
- return fputil::cast<float16>(exp2_hi_mid * exp10_lo);
+ // 10^x = 2^(hi + mid) * 10^lo
+ auto [exp2_hi_mid, exp10_lo] = exp10_range_reduction(x);
+ return static_cast<float16>(exp2_hi_mid * exp10_lo);
}
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/exp10m1f16.cpp b/libc/src/math/generic/exp10m1f16.cpp
new file mode 100644
index 0000000..9f2c195
--- /dev/null
+++ b/libc/src/math/generic/exp10m1f16.cpp
@@ -0,0 +1,163 @@
+//===-- Half-precision 10^x - 1 function ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/exp10m1f16.h"
+#include "expxf16.h"
+#include "hdr/errno_macros.h"
+#include "hdr/fenv_macros.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/cast.h"
+#include "src/__support/FPUtil/except_value_utils.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/rounding_mode.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h"
+#include "src/__support/macros/properties/cpu_features.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+static constexpr fputil::ExceptValues<float16, 3> EXP10M1F16_EXCEPTS_LO = {{
+ // (input, RZ output, RU offset, RD offset, RN offset)
+ // x = 0x1.5c4p-4, exp10m1f16(x) = 0x1.bacp-3 (RZ)
+ {0x2d71U, 0x32ebU, 1U, 0U, 0U},
+ // x = -0x1.5ep-13, exp10m1f16(x) = -0x1.92cp-12 (RZ)
+ {0x8978U, 0x8e4bU, 0U, 1U, 0U},
+ // x = -0x1.e2p-10, exp10m1f16(x) = -0x1.14cp-8 (RZ)
+ {0x9788U, 0x9c53U, 0U, 1U, 0U},
+}};
+
+#ifdef LIBC_TARGET_CPU_HAS_FMA
+static constexpr size_t N_EXP10M1F16_EXCEPTS_HI = 3;
+#else
+static constexpr size_t N_EXP10M1F16_EXCEPTS_HI = 6;
+#endif
+
+static constexpr fputil::ExceptValues<float16, N_EXP10M1F16_EXCEPTS_HI>
+ EXP10M1F16_EXCEPTS_HI = {{
+ // (input, RZ output, RU offset, RD offset, RN offset)
+ // x = 0x1.8f4p-2, exp10m1f16(x) = 0x1.744p+0 (RZ)
+ {0x363dU, 0x3dd1U, 1U, 0U, 0U},
+ // x = 0x1.95cp-2, exp10m1f16(x) = 0x1.7d8p+0 (RZ)
+ {0x3657U, 0x3df6U, 1U, 0U, 0U},
+ // x = 0x1.d04p-2, exp10m1f16(x) = 0x1.d7p+0 (RZ)
+ {0x3741U, 0x3f5cU, 1U, 0U, 1U},
+#ifndef LIBC_TARGET_CPU_HAS_FMA
+ // x = 0x1.0cp+1, exp10m1f16(x) = 0x1.ec4p+6 (RZ)
+ {0x4030U, 0x57b1U, 1U, 0U, 1U},
+ // x = 0x1.1b8p+1, exp10m1f16(x) = 0x1.45cp+7 (RZ)
+ {0x406eU, 0x5917U, 1U, 0U, 1U},
+ // x = 0x1.2f4p+2, exp10m1f16(x) = 0x1.ab8p+15 (RZ)
+ {0x44bdU, 0x7aaeU, 1U, 0U, 1U},
+#endif
+ }};
+
+LLVM_LIBC_FUNCTION(float16, exp10m1f16, (float16 x)) {
+ using FPBits = fputil::FPBits<float16>;
+ FPBits x_bits(x);
+
+ uint16_t x_u = x_bits.uintval();
+ uint16_t x_abs = x_u & 0x7fffU;
+
+ // When |x| <= 2^(-3), or |x| >= 11 * log10(2), or x is NaN.
+ if (LIBC_UNLIKELY(x_abs <= 0x3000U || x_abs >= 0x429fU)) {
+ // exp10m1(NaN) = NaN
+ if (x_bits.is_nan()) {
+ if (x_bits.is_signaling_nan()) {
+ fputil::raise_except_if_required(FE_INVALID);
+ return FPBits::quiet_nan().get_val();
+ }
+
+ return x;
+ }
+
+ // When x >= 16 * log10(2).
+ if (x_u >= 0x44d1U && x_bits.is_pos()) {
+ // exp10m1(+inf) = +inf
+ if (x_bits.is_inf())
+ return FPBits::inf().get_val();
+
+ switch (fputil::quick_get_round()) {
+ case FE_TONEAREST:
+ case FE_UPWARD:
+ fputil::set_errno_if_required(ERANGE);
+ fputil::raise_except_if_required(FE_OVERFLOW | FE_INEXACT);
+ return FPBits::inf().get_val();
+ default:
+ return FPBits::max_normal().get_val();
+ }
+ }
+
+ // When x < -11 * log10(2).
+ if (x_u > 0xc29fU) {
+ // exp10m1(-inf) = -1
+ if (x_bits.is_inf())
+ return FPBits::one(Sign::NEG).get_val();
+
+ // When x >= -0x1.ce4p+1, round(10^x - 1, HP, RN) = -0x1.ffcp-1.
+ if (x_u <= 0xc339U) {
+ return fputil::round_result_slightly_down(
+ fputil::cast<float16>(-0x1.ffcp-1));
+ }
+
+ // When x < -0x1.ce4p+1, round(10^x - 1, HP, RN) = -1.
+ switch (fputil::quick_get_round()) {
+ case FE_TONEAREST:
+ case FE_DOWNWARD:
+ return FPBits::one(Sign::NEG).get_val();
+ default:
+ return fputil::cast<float16>(-0x1.ffcp-1);
+ }
+ }
+
+ // When |x| <= 2^(-3).
+ if (x_abs <= 0x3000U) {
+ if (auto r = EXP10M1F16_EXCEPTS_LO.lookup(x_u);
+ LIBC_UNLIKELY(r.has_value()))
+ return r.value();
+
+ float xf = x;
+ // Degree-5 minimax polynomial generated by Sollya with the following
+ // commands:
+ // > display = hexadecimal;
+ // > P = fpminimax((10^x - 1)/x, 4, [|SG...|], [-2^-3, 2^-3]);
+ // > x * P;
+ return fputil::cast<float16>(
+ xf * fputil::polyeval(xf, 0x1.26bb1cp+1f, 0x1.5351c8p+1f,
+ 0x1.04704p+1f, 0x1.2ce084p+0f, 0x1.14a6bep-1f));
+ }
+ }
+
+ // When x is 1, 2, or 3, the result is exact but hard to round.
+ // 10^4 - 1 = 9'999 is not exactly representable as a float16, but luckily the
+ // polynomial approximation gives the correct result for x = 4 in all
+ // rounding modes.
+ if (LIBC_UNLIKELY((x_u & ~(0x3c00U | 0x4000U | 0x4200U | 0x4400U)) == 0)) {
+ switch (x_u) {
+ case 0x3c00U: // x = 1.0f16
+ return fputil::cast<float16>(9.0);
+ case 0x4000U: // x = 2.0f16
+ return fputil::cast<float16>(99.0);
+ case 0x4200U: // x = 3.0f16
+ return fputil::cast<float16>(999.0);
+ }
+ }
+
+ if (auto r = EXP10M1F16_EXCEPTS_HI.lookup(x_u); LIBC_UNLIKELY(r.has_value()))
+ return r.value();
+
+ // exp10(x) = exp2(hi + mid) * exp10(lo)
+ auto [exp2_hi_mid, exp10_lo] = exp10_range_reduction(x);
+ // exp10m1(x) = exp2(hi + mid) * exp10(lo) - 1
+ return fputil::cast<float16>(
+ fputil::multiply_add(exp2_hi_mid, exp10_lo, -1.0f));
+}
+
+} // namespace LIBC_NAMESPACE_DECL
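
The special cases above rely on 10^x - 1 being exactly representable in
binary16 for x = 1, 2, 3 but not for x = 4. A minimal standalone sketch
checking that representability claim (illustrative only, not part of the
patch; fits_binary16 is a made-up helper):

    #include <cmath>
    #include <cstdio>

    // True if v is exactly representable as a normal IEEE binary16 value
    // (11-bit significand); ignores overflow and subnormals, which is fine
    // for these inputs.
    static bool fits_binary16(double v) {
      int exp;
      std::frexp(v, &exp);                    // v = m * 2^exp, 0.5 <= m < 1
      double ulp = std::ldexp(1.0, exp - 11); // binary16 spacing near v
      return std::fmod(v, ulp) == 0.0;
    }

    int main() {
      for (double v : {9.0, 99.0, 999.0, 9999.0})
        std::printf("%6g: %s\n", v, fits_binary16(v) ? "exact" : "not exact");
      // 9, 99, 999 print "exact"; 9999 prints "not exact" (its nearest
      // binary16 neighbours are 9992 and 10000).
    }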
diff --git a/libc/src/math/generic/expxf16.h b/libc/src/math/generic/expxf16.h
index 3529413..8de329b 100644
--- a/libc/src/math/generic/expxf16.h
+++ b/libc/src/math/generic/expxf16.h
@@ -127,6 +127,53 @@ LIBC_INLINE ExpRangeReduction exp2_range_reduction(float16 x) {
return {exp2_hi_mid, exp2_lo};
}
+// Generated by Sollya with the following commands:
+// > display = hexadecimal;
+// > round(log2(10), SG, RN);
+static constexpr float LOG2F_10 = 0x1.a934fp+1f;
+
+// Generated by Sollya with the following commands:
+// > display = hexadecimal;
+// > round(log10(2), SG, RN);
+static constexpr float LOG10F_2 = 0x1.344136p-2f;
+
+LIBC_INLINE ExpRangeReduction exp10_range_reduction(float16 x) {
+ // For -8 < x < 5, to compute 10^x, we perform the following range reduction:
+ // find hi, mid, lo, such that:
+ // x = (hi + mid) * log10(2) + lo, in which
+ // hi is an integer,
+ // mid * 2^3 is an integer,
+ // -2^(-4) <= lo < 2^(-4).
+ // In particular,
+ // hi + mid = round(x * 2^3 * log2(10)) * 2^(-3).
+ // Then,
+ // 10^x = 10^((hi + mid) * log10(2) + lo) = 2^(hi + mid) * 10^lo
+ // We store 2^mid in the lookup table EXP2_MID_BITS, and compute 2^hi * 2^mid
+ // by adding hi to the exponent field of 2^mid. 10^lo is computed using a
+ // degree-4 minimax polynomial generated by Sollya.
+
+ float xf = x;
+ float kf = fputil::nearest_integer(xf * (LOG2F_10 * 0x1.0p+3f));
+ int x_hi_mid = static_cast<int>(kf);
+ int x_hi = x_hi_mid >> 3;
+ int x_mid = x_hi_mid & 0x7;
+ // lo = x - (hi + mid) * log10(2) = round(x * 2^3 * log2(10)) * log10(2) * (-2^(-3)) + x
+ float lo = fputil::multiply_add(kf, LOG10F_2 * -0x1.0p-3f, xf);
+
+ uint32_t exp2_hi_mid_bits =
+ EXP2_MID_BITS[x_mid] +
+ static_cast<uint32_t>(x_hi << fputil::FPBits<float>::FRACTION_LEN);
+ float exp2_hi_mid = fputil::FPBits<float>(exp2_hi_mid_bits).get_val();
+ // Degree-4 minimax polynomial generated by Sollya with the following
+ // commands:
+ // > display = hexadecimal;
+ // > P = fpminimax((10^x - 1)/x, 3, [|SG...|], [-2^-4, 2^-4]);
+ // > 1 + x * P;
+ float exp10_lo = fputil::polyeval(lo, 0x1p+0f, 0x1.26bb14p+1f, 0x1.53526p+1f,
+ 0x1.04b434p+1f, 0x1.2bcf9ep+0f);
+ return {exp2_hi_mid, exp10_lo};
+}
+
} // namespace LIBC_NAMESPACE_DECL
#endif // LLVM_LIBC_SRC_MATH_GENERIC_EXPXF16_H
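
To see the decomposition at work, here is a minimal sketch that replays
exp10_range_reduction for x = 1 using plain double arithmetic (illustrative
only; the patched code uses float and the EXP2_MID_BITS table instead of
calling exp2/pow):

    #include <cmath>
    #include <cstdio>

    int main() {
      double x = 1.0;
      // kf = round(x * 2^3 * log2(10)) = 27, so hi + mid = 27/8 = 3.375.
      double kf = std::round(x * 8.0 * std::log2(10.0));
      double hi_mid = kf / 8.0;
      // lo = x - (hi + mid) * log10(2), about -0.016 here.
      double lo = x - hi_mid * std::log10(2.0);
      // 10^x = 2^(hi + mid) * 10^lo.
      double result = std::exp2(hi_mid) * std::pow(10.0, lo);
      std::printf("10^%g ~= %.6f\n", x, result); // prints 10^1 ~= 10.000000
    }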
diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt
index 12e1d07..5dff0b4 100644
--- a/libc/test/src/math/CMakeLists.txt
+++ b/libc/test/src/math/CMakeLists.txt
@@ -1063,6 +1063,17 @@ add_fp_unittest(
)
add_fp_unittest(
+ exp10m1f16_test
+ NEED_MPFR
+ SUITE
+ libc-math-unittests
+ SRCS
+ exp10m1f16_test.cpp
+ DEPENDS
+ libc.src.math.exp10m1f16
+)
+
+add_fp_unittest(
copysign_test
SUITE
libc-math-unittests
diff --git a/libc/test/src/math/exp10m1f16_test.cpp b/libc/test/src/math/exp10m1f16_test.cpp
new file mode 100644
index 0000000..41bb12f
--- /dev/null
+++ b/libc/test/src/math/exp10m1f16_test.cpp
@@ -0,0 +1,40 @@
+//===-- Exhaustive test for exp10m1f16 ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/exp10m1f16.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include "utils/MPFRWrapper/MPFRUtils.h"
+
+using LlvmLibcExp10m1f16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
+
+namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
+
+// Range: [0, Inf];
+static constexpr uint16_t POS_START = 0x0000U;
+static constexpr uint16_t POS_STOP = 0x7c00U;
+
+// Range: [-Inf, 0];
+static constexpr uint16_t NEG_START = 0x8000U;
+static constexpr uint16_t NEG_STOP = 0xfc00U;
+
+TEST_F(LlvmLibcExp10m1f16Test, PositiveRange) {
+ for (uint16_t v = POS_START; v <= POS_STOP; ++v) {
+ float16 x = FPBits(v).get_val();
+ EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10m1, x,
+ LIBC_NAMESPACE::exp10m1f16(x), 0.5);
+ }
+}
+
+TEST_F(LlvmLibcExp10m1f16Test, NegativeRange) {
+ for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) {
+ float16 x = FPBits(v).get_val();
+ EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10m1, x,
+ LIBC_NAMESPACE::exp10m1f16(x), 0.5);
+ }
+}
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index 447ea69..6b3623d 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -1236,6 +1236,19 @@ add_fp_unittest(
)
add_fp_unittest(
+ exp10m1f16_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ exp10m1f16_test.cpp
+ DEPENDS
+ libc.hdr.fenv_macros
+ libc.src.errno.errno
+ libc.src.math.exp10m1f16
+ libc.src.__support.FPUtil.cast
+)
+
+add_fp_unittest(
copysign_test
SUITE
libc-math-smoke-tests
diff --git a/libc/test/src/math/smoke/exp10m1f16_test.cpp b/libc/test/src/math/smoke/exp10m1f16_test.cpp
new file mode 100644
index 0000000..dfa7fa4
--- /dev/null
+++ b/libc/test/src/math/smoke/exp10m1f16_test.cpp
@@ -0,0 +1,113 @@
+//===-- Unittests for exp10m1f16 ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hdr/fenv_macros.h"
+#include "src/__support/FPUtil/cast.h"
+#include "src/errno/libc_errno.h"
+#include "src/math/exp10m1f16.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+using LlvmLibcExp10m1f16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
+
+TEST_F(LlvmLibcExp10m1f16Test, SpecialNumbers) {
+ LIBC_NAMESPACE::libc_errno = 0;
+
+ EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::exp10m1f16(aNaN));
+ EXPECT_MATH_ERRNO(0);
+
+ EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::exp10m1f16(sNaN),
+ FE_INVALID);
+ EXPECT_MATH_ERRNO(0);
+
+ EXPECT_FP_EQ_ALL_ROUNDING(inf, LIBC_NAMESPACE::exp10m1f16(inf));
+ EXPECT_MATH_ERRNO(0);
+
+ EXPECT_FP_EQ_ALL_ROUNDING(LIBC_NAMESPACE::fputil::cast<float16>(-1.0),
+ LIBC_NAMESPACE::exp10m1f16(neg_inf));
+ EXPECT_MATH_ERRNO(0);
+
+ EXPECT_FP_EQ_ALL_ROUNDING(zero, LIBC_NAMESPACE::exp10m1f16(zero));
+ EXPECT_MATH_ERRNO(0);
+
+ EXPECT_FP_EQ_ALL_ROUNDING(neg_zero, LIBC_NAMESPACE::exp10m1f16(neg_zero));
+ EXPECT_MATH_ERRNO(0);
+}
+
+TEST_F(LlvmLibcExp10m1f16Test, Overflow) {
+ LIBC_NAMESPACE::libc_errno = 0;
+
+ EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::exp10m1f16(max_normal),
+ FE_OVERFLOW | FE_INEXACT);
+ EXPECT_MATH_ERRNO(ERANGE);
+
+ // round(16 * log10(2), HP, RN);
+ float16 x = LIBC_NAMESPACE::fputil::cast<float16>(0x1.344p+2);
+
+ EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_NEAREST(
+ inf, LIBC_NAMESPACE::exp10m1f16(x), FE_OVERFLOW | FE_INEXACT);
+ EXPECT_MATH_ERRNO(ERANGE);
+
+ EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_UPWARD(
+ inf, LIBC_NAMESPACE::exp10m1f16(x), FE_OVERFLOW | FE_INEXACT);
+ EXPECT_MATH_ERRNO(ERANGE);
+
+ EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_DOWNWARD(
+ max_normal, LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT);
+ EXPECT_MATH_ERRNO(0);
+
+ EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_TOWARD_ZERO(
+ max_normal, LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT);
+ EXPECT_MATH_ERRNO(0);
+}
+
+TEST_F(LlvmLibcExp10m1f16Test, ResultNearNegOne) {
+ LIBC_NAMESPACE::libc_errno = 0;
+
+ EXPECT_FP_EQ_WITH_EXCEPTION(LIBC_NAMESPACE::fputil::cast<float16>(-1.0),
+ LIBC_NAMESPACE::exp10m1f16(neg_max_normal),
+ FE_INEXACT);
+
+ // round(-11 * log10(2), HP, RD);
+ float16 x = LIBC_NAMESPACE::fputil::cast<float16>(-0x1.a8p+1);
+
+ EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_NEAREST(
+ LIBC_NAMESPACE::fputil::cast<float16>(-0x1.ffcp-1),
+ LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT);
+
+ EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_UPWARD(
+ LIBC_NAMESPACE::fputil::cast<float16>(-0x1.ffcp-1),
+ LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT);
+
+ EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_DOWNWARD(
+ LIBC_NAMESPACE::fputil::cast<float16>(-1.0),
+ LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT);
+
+ EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_TOWARD_ZERO(
+ LIBC_NAMESPACE::fputil::cast<float16>(-0x1.ffcp-1),
+ LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT);
+
+ // Next float16 value below -0x1.ce4p+1.
+ x = LIBC_NAMESPACE::fputil::cast<float16>(-0x1.ce8p+1);
+
+ EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_NEAREST(
+ LIBC_NAMESPACE::fputil::cast<float16>(-1.0),
+ LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT);
+
+ EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_UPWARD(
+ LIBC_NAMESPACE::fputil::cast<float16>(-0x1.ffcp-1),
+ LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT);
+
+ EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_DOWNWARD(
+ LIBC_NAMESPACE::fputil::cast<float16>(-1.0),
+ LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT);
+
+ EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_TOWARD_ZERO(
+ LIBC_NAMESPACE::fputil::cast<float16>(-0x1.ffcp-1),
+ LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT);
+}
diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp
index eecffc7..bd4fbe2 100644
--- a/libc/utils/MPFRWrapper/MPFRUtils.cpp
+++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp
@@ -334,6 +334,29 @@ public:
return result;
}
+ MPFRNumber exp10m1() const {
+ // TODO: Only use mpfr_exp10m1 once CI and buildbots get MPFR >= 4.2.0.
+#if MPFR_VERSION_MAJOR > 4 || \
+ (MPFR_VERSION_MAJOR == 4 && MPFR_VERSION_MINOR >= 2)
+ MPFRNumber result(*this);
+ mpfr_exp10m1(result.value, value, mpfr_rounding);
+ return result;
+#else
+ unsigned int prec = mpfr_precision * 3;
+ MPFRNumber result(*this, prec);
+
+ MPFRNumber ln10(10.0f, prec);
+ // log(10)
+ mpfr_log(ln10.value, ln10.value, mpfr_rounding);
+ // x * log(10)
+ mpfr_mul(result.value, value, ln10.value, mpfr_rounding);
+ // e^(x * log(10)) - 1
+ int ex = mpfr_expm1(result.value, result.value, mpfr_rounding);
+ mpfr_subnormalize(result.value, ex, mpfr_rounding);
+ return result;
+#endif
+ }
+
MPFRNumber expm1() const {
MPFRNumber result(*this);
mpfr_expm1(result.value, value, mpfr_rounding);
@@ -744,6 +767,8 @@ unary_operation(Operation op, InputType input, unsigned int precision,
return mpfrInput.exp2m1();
case Operation::Exp10:
return mpfrInput.exp10();
+ case Operation::Exp10m1:
+ return mpfrInput.exp10m1();
case Operation::Expm1:
return mpfrInput.expm1();
case Operation::Floor:
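
The fallback above uses the identity 10^x - 1 = expm1(x * log(10)), carried
out at triple precision so the final rounding back to the working precision
stays faithful. A minimal sketch of the same identity in plain double
(illustrative only):

    #include <cmath>
    #include <cstdio>

    int main() {
      double x = 0.5;
      double direct = std::pow(10.0, x) - 1.0;           // 10^x - 1
      double via_expm1 = std::expm1(x * std::log(10.0)); // e^(x ln 10) - 1
      // expm1 avoids catastrophic cancellation for small x; the MPFR
      // fallback additionally widens the precision before rounding back.
      std::printf("%.17g vs %.17g\n", direct, via_expm1);
    }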
diff --git a/libc/utils/MPFRWrapper/MPFRUtils.h b/libc/utils/MPFRWrapper/MPFRUtils.h
index 8d51fa4..9fc12a6 100644
--- a/libc/utils/MPFRWrapper/MPFRUtils.h
+++ b/libc/utils/MPFRWrapper/MPFRUtils.h
@@ -42,6 +42,7 @@ enum class Operation : int {
Exp2,
Exp2m1,
Exp10,
+ Exp10m1,
Expm1,
Floor,
Log,
diff --git a/libcxx/docs/Status/Cxx23Issues.csv b/libcxx/docs/Status/Cxx23Issues.csv
index 63e4176..cfa7212 100644
--- a/libcxx/docs/Status/Cxx23Issues.csv
+++ b/libcxx/docs/Status/Cxx23Issues.csv
@@ -168,7 +168,7 @@
"`LWG3672 <https://wg21.link/LWG3672>`__","``common_iterator::operator->()`` should return by value","2022-07 (Virtual)","|Complete|","19.0",""
"`LWG3683 <https://wg21.link/LWG3683>`__","``operator==`` for ``polymorphic_allocator`` cannot deduce template argument in common cases","2022-07 (Virtual)","|Complete|","20.0",""
"`LWG3687 <https://wg21.link/LWG3687>`__","``expected<cv void, E>`` move constructor should move","2022-07 (Virtual)","|Complete|","16.0",""
-"`LWG3692 <https://wg21.link/LWG3692>`__","``zip_view::iterator``'s ``operator<=>`` is overconstrained","2022-07 (Virtual)","","",""
+"`LWG3692 <https://wg21.link/LWG3692>`__","``zip_view::iterator``'s ``operator<=>`` is overconstrained","2022-07 (Virtual)","|Complete|","20.0",""
"`LWG3701 <https://wg21.link/LWG3701>`__","Make ``formatter<remove_cvref_t<const charT[N]>, charT>`` requirement explicit","2022-07 (Virtual)","|Complete|","15.0",""
"`LWG3702 <https://wg21.link/LWG3702>`__","Should ``zip_transform_view::iterator`` remove ``operator<``","2022-07 (Virtual)","","",""
"`LWG3703 <https://wg21.link/LWG3703>`__","Missing requirements for ``expected<T, E>`` requires ``is_void<T>``","2022-07 (Virtual)","|Complete|","16.0",""
diff --git a/libcxx/docs/Status/Cxx23Papers.csv b/libcxx/docs/Status/Cxx23Papers.csv
index da7b588..c64f1c4 100644
--- a/libcxx/docs/Status/Cxx23Papers.csv
+++ b/libcxx/docs/Status/Cxx23Papers.csv
@@ -60,7 +60,7 @@
"`P1642R11 <https://wg21.link/P1642R11>`__","Freestanding ``[utilities]``, ``[ranges]``, and ``[iterators]``","2022-07 (Virtual)","","",""
"`P1899R3 <https://wg21.link/P1899R3>`__","``stride_view``","2022-07 (Virtual)","","",""
"`P2093R14 <https://wg21.link/P2093R14>`__","Formatted output","2022-07 (Virtual)","|Complete|","18.0",""
-"`P2165R4 <https://wg21.link/P2165R4>`__","Compatibility between ``tuple``, ``pair`` and ``tuple-like`` objects","2022-07 (Virtual)","","",""
+"`P2165R4 <https://wg21.link/P2165R4>`__","Compatibility between ``tuple``, ``pair`` and ``tuple-like`` objects","2022-07 (Virtual)","|Partial|","","Only the part for ``zip_view`` is implemented."
"`P2278R4 <https://wg21.link/P2278R4>`__","``cbegin`` should always return a constant iterator","2022-07 (Virtual)","","",""
"`P2286R8 <https://wg21.link/P2286R8>`__","Formatting Ranges","2022-07 (Virtual)","|Complete|","16.0",""
"`P2291R3 <https://wg21.link/P2291R3>`__","Add Constexpr Modifiers to Functions ``to_chars`` and ``from_chars`` for Integral Types in ``<charconv>`` Header","2022-07 (Virtual)","|Complete|","16.0",""
diff --git a/libcxx/include/__ranges/zip_view.h b/libcxx/include/__ranges/zip_view.h
index fe3c87a..835e23c 100644
--- a/libcxx/include/__ranges/zip_view.h
+++ b/libcxx/include/__ranges/zip_view.h
@@ -36,7 +36,6 @@
#include <__utility/forward.h>
#include <__utility/integer_sequence.h>
#include <__utility/move.h>
-#include <__utility/pair.h>
#include <tuple>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -58,22 +57,11 @@ concept __zip_is_common =
(!(bidirectional_range<_Ranges> && ...) && (common_range<_Ranges> && ...)) ||
((random_access_range<_Ranges> && ...) && (sized_range<_Ranges> && ...));
-template <typename _Tp, typename _Up>
-auto __tuple_or_pair_test() -> pair<_Tp, _Up>;
-
-template <typename... _Types>
- requires(sizeof...(_Types) != 2)
-auto __tuple_or_pair_test() -> tuple<_Types...>;
-
-template <class... _Types>
-using __tuple_or_pair = decltype(__tuple_or_pair_test<_Types...>());
-
template <class _Fun, class _Tuple>
_LIBCPP_HIDE_FROM_ABI constexpr auto __tuple_transform(_Fun&& __f, _Tuple&& __tuple) {
return std::apply(
[&]<class... _Types>(_Types&&... __elements) {
- return __tuple_or_pair<invoke_result_t<_Fun&, _Types>...>(
- std::invoke(__f, std::forward<_Types>(__elements))...);
+ return tuple<invoke_result_t<_Fun&, _Types>...>(std::invoke(__f, std::forward<_Types>(__elements))...);
},
std::forward<_Tuple>(__tuple));
}
@@ -88,7 +76,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr void __tuple_for_each(_Fun&& __f, _Tuple&& __tup
}
template <class _Fun, class _Tuple1, class _Tuple2, size_t... _Indices>
-_LIBCPP_HIDE_FROM_ABI constexpr __tuple_or_pair<
+_LIBCPP_HIDE_FROM_ABI constexpr tuple<
invoke_result_t<_Fun&,
typename tuple_element<_Indices, remove_cvref_t<_Tuple1>>::type,
typename tuple_element<_Indices, remove_cvref_t<_Tuple2>>::type>...>
@@ -250,10 +238,9 @@ template <input_range... _Views>
requires(view<_Views> && ...) && (sizeof...(_Views) > 0)
template <bool _Const>
class zip_view<_Views...>::__iterator : public __zip_view_iterator_category_base<_Const, _Views...> {
- __tuple_or_pair<iterator_t<__maybe_const<_Const, _Views>>...> __current_;
+ tuple<iterator_t<__maybe_const<_Const, _Views>>...> __current_;
- _LIBCPP_HIDE_FROM_ABI constexpr explicit __iterator(
- __tuple_or_pair<iterator_t<__maybe_const<_Const, _Views>>...> __current)
+ _LIBCPP_HIDE_FROM_ABI constexpr explicit __iterator(tuple<iterator_t<__maybe_const<_Const, _Views>>...> __current)
: __current_(std::move(__current)) {}
template <bool>
@@ -266,7 +253,7 @@ class zip_view<_Views...>::__iterator : public __zip_view_iterator_category_base
public:
using iterator_concept = decltype(__get_zip_view_iterator_tag<_Const, _Views...>());
- using value_type = __tuple_or_pair<range_value_t<__maybe_const<_Const, _Views>>...>;
+ using value_type = tuple<range_value_t<__maybe_const<_Const, _Views>>...>;
using difference_type = common_type_t<range_difference_t<__maybe_const<_Const, _Views>>...>;
_LIBCPP_HIDE_FROM_ABI __iterator() = default;
@@ -340,33 +327,8 @@ public:
}
}
- _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator<(const __iterator& __x, const __iterator& __y)
- requires __zip_all_random_access<_Const, _Views...>
- {
- return __x.__current_ < __y.__current_;
- }
-
- _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator>(const __iterator& __x, const __iterator& __y)
- requires __zip_all_random_access<_Const, _Views...>
- {
- return __y < __x;
- }
-
- _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator<=(const __iterator& __x, const __iterator& __y)
- requires __zip_all_random_access<_Const, _Views...>
- {
- return !(__y < __x);
- }
-
- _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator>=(const __iterator& __x, const __iterator& __y)
- requires __zip_all_random_access<_Const, _Views...>
- {
- return !(__x < __y);
- }
-
_LIBCPP_HIDE_FROM_ABI friend constexpr auto operator<=>(const __iterator& __x, const __iterator& __y)
- requires __zip_all_random_access<_Const, _Views...> &&
- (three_way_comparable<iterator_t<__maybe_const<_Const, _Views>>> && ...)
+ requires __zip_all_random_access<_Const, _Views...>
{
return __x.__current_ <=> __y.__current_;
}
@@ -427,10 +389,9 @@ template <input_range... _Views>
requires(view<_Views> && ...) && (sizeof...(_Views) > 0)
template <bool _Const>
class zip_view<_Views...>::__sentinel {
- __tuple_or_pair<sentinel_t<__maybe_const<_Const, _Views>>...> __end_;
+ tuple<sentinel_t<__maybe_const<_Const, _Views>>...> __end_;
- _LIBCPP_HIDE_FROM_ABI constexpr explicit __sentinel(
- __tuple_or_pair<sentinel_t<__maybe_const<_Const, _Views>>...> __end)
+ _LIBCPP_HIDE_FROM_ABI constexpr explicit __sentinel(tuple<sentinel_t<__maybe_const<_Const, _Views>>...> __end)
: __end_(__end) {}
friend class zip_view<_Views...>;
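
The observable effect of the P2165R4 part of this change is that zipping
exactly two ranges now yields std::tuple rather than std::pair. A minimal
sketch, assuming a C++23 standard library with this patch applied:

    #include <ranges>
    #include <tuple>
    #include <type_traits>

    int main() {
      int a[] = {1, 2, 3};
      double b[] = {4.0, 5.0, 6.0};
      auto z = std::views::zip(a, b);
      // Previously std::pair<int, double>; std::tuple with P2165R4.
      static_assert(std::is_same_v<std::ranges::range_value_t<decltype(z)>,
                                   std::tuple<int, double>>);
      // Dereferencing likewise yields a tuple of references.
      static_assert(std::is_same_v<std::ranges::range_reference_t<decltype(z)>,
                                   std::tuple<int&, double&>>);
    }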
diff --git a/libcxx/include/__split_buffer b/libcxx/include/__split_buffer
index dfe552f..c481760 100644
--- a/libcxx/include/__split_buffer
+++ b/libcxx/include/__split_buffer
@@ -80,9 +80,6 @@ public:
pointer __end_;
_LIBCPP_COMPRESSED_PAIR(pointer, __end_cap_, allocator_type, __alloc_);
- using __alloc_ref = __add_lvalue_reference_t<allocator_type>;
- using __alloc_const_ref = __add_lvalue_reference_t<allocator_type>;
-
__split_buffer(const __split_buffer&) = delete;
__split_buffer& operator=(const __split_buffer&) = delete;
diff --git a/libcxx/include/future b/libcxx/include/future
index dfa373d..f16f423 100644
--- a/libcxx/include/future
+++ b/libcxx/include/future
@@ -594,7 +594,7 @@ public:
_LIBCPP_HIDE_FROM_ABI void set_value_at_thread_exit(_Arg&& __arg);
_LIBCPP_HIDE_FROM_ABI _Rp move();
- _LIBCPP_HIDE_FROM_ABI __add_lvalue_reference_t<_Rp> copy();
+ _LIBCPP_HIDE_FROM_ABI _Rp& copy();
};
template <class _Rp>
@@ -636,7 +636,7 @@ _Rp __assoc_state<_Rp>::move() {
}
template <class _Rp>
-__add_lvalue_reference_t<_Rp> __assoc_state<_Rp>::copy() {
+_Rp& __assoc_state<_Rp>::copy() {
unique_lock<mutex> __lk(this->__mut_);
this->__sub_wait(__lk);
if (this->__exception_ != nullptr)
diff --git a/libcxx/test/std/ranges/range.adaptors/range.zip/cpo.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.zip/cpo.pass.cpp
index ea5953c..bdfd58ff 100644
--- a/libcxx/test/std/ranges/range.adaptors/range.zip/cpo.pass.cpp
+++ b/libcxx/test/std/ranges/range.adaptors/range.zip/cpo.pass.cpp
@@ -63,11 +63,7 @@ constexpr bool test() {
std::ranges::zip_view<std::ranges::zip_view<SizedRandomAccessView, SizedRandomAccessView>>> decltype(auto) v2 =
std::views::zip(v);
-#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet
- static_assert(std::is_same_v<std::ranges::range_reference_t<decltype(v2)>, std::tuple<std::pair<int&, int&>>>);
-#else
static_assert(std::is_same_v<std::ranges::range_reference_t<decltype(v2)>, std::tuple<std::tuple<int&, int&>>>);
-#endif
}
return true;
}
diff --git a/libcxx/test/std/ranges/range.adaptors/range.zip/ctor.default.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.zip/ctor.default.pass.cpp
index f532896..fdfcc02 100644
--- a/libcxx/test/std/ranges/range.adaptors/range.zip/ctor.default.pass.cpp
+++ b/libcxx/test/std/ranges/range.adaptors/range.zip/ctor.default.pass.cpp
@@ -49,12 +49,8 @@ constexpr bool test() {
using View = std::ranges::zip_view<DefaultConstructibleView, DefaultConstructibleView>;
View v = View(); // the default constructor is not explicit
assert(v.size() == 3);
- auto it = v.begin();
-#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet
- using Value = std::pair<const int&, const int&>;
-#else
+ auto it = v.begin();
using Value = std::tuple<const int&, const int&>;
-#endif
assert(*it++ == Value(buff[0], buff[0]));
assert(*it++ == Value(buff[1], buff[1]));
assert(*it == Value(buff[2], buff[2]));
diff --git a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/compare.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/compare.pass.cpp
index ed1cb0c..8ab7346 100644
--- a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/compare.pass.cpp
+++ b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/compare.pass.cpp
@@ -10,17 +10,8 @@
// friend constexpr bool operator==(const iterator& x, const iterator& y)
// requires (equality_comparable<iterator_t<maybe-const<Const, Views>>> && ...);
-// friend constexpr bool operator<(const iterator& x, const iterator& y)
-// requires all-random-access<Const, Views...>;
-// friend constexpr bool operator>(const iterator& x, const iterator& y)
-// requires all-random-access<Const, Views...>;
-// friend constexpr bool operator<=(const iterator& x, const iterator& y)
-// requires all-random-access<Const, Views...>;
-// friend constexpr bool operator>=(const iterator& x, const iterator& y)
-// requires all-random-access<Const, Views...>;
// friend constexpr auto operator<=>(const iterator& x, const iterator& y)
-// requires all-random-access<Const, Views...> &&
-// (three_way_comparable<iterator_t<maybe-const<Const, Views>>> && ...);
+// requires all-random-access<Const, Views...>;
#include <ranges>
#include <compare>
@@ -165,12 +156,7 @@ constexpr bool test() {
using Subrange = std::ranges::subrange<It>;
static_assert(!std::three_way_comparable<It>);
using R = std::ranges::zip_view<Subrange, Subrange>;
-#ifdef _LIBCPP_VERSION
- // libc++ hasn't implemented LWG-3692 "zip_view::iterator's operator<=> is overconstrained"
- static_assert(!std::three_way_comparable<std::ranges::iterator_t<R>>);
-#else
static_assert(std::three_way_comparable<std::ranges::iterator_t<R>>);
-#endif
int a[] = {1, 2, 3, 4};
int b[] = {5, 6, 7, 8, 9};
diff --git a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/deref.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/deref.pass.cpp
index 569d040..fb58aa2 100644
--- a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/deref.pass.cpp
+++ b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/deref.pass.cpp
@@ -42,11 +42,7 @@ constexpr bool test() {
auto [x, y] = *it;
assert(&x == &(a[0]));
assert(&y == &(b[0]));
-#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet
- static_assert(std::is_same_v<decltype(*it), std::pair<int&, double&>>);
-#else
static_assert(std::is_same_v<decltype(*it), std::tuple<int&, double&>>);
-#endif
x = 5;
y = 0.1;
@@ -70,11 +66,7 @@ constexpr bool test() {
auto it = v.begin();
assert(&(std::get<0>(*it)) == &(a[0]));
assert(&(std::get<1>(*it)) == &(a[0]));
-#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet
- static_assert(std::is_same_v<decltype(*it), std::pair<int&, int const&>>);
-#else
static_assert(std::is_same_v<decltype(*it), std::tuple<int&, int const&>>);
-#endif
}
return true;
}
diff --git a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/member_types.compile.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/member_types.compile.pass.cpp
index c19f6c2..2f2f0fc 100644
--- a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/member_types.compile.pass.cpp
+++ b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/member_types.compile.pass.cpp
@@ -65,7 +65,7 @@ struct ConstVeryDifferentRange {
void test() {
int buffer[] = {1, 2, 3, 4};
{
- // 2 views should have pair value_type
+ // 2 views should have 2-tuple value_type
// random_access_iterator_tag
std::ranges::zip_view v(buffer, buffer);
using Iter = decltype(v.begin());
@@ -73,11 +73,7 @@ void test() {
static_assert(std::is_same_v<Iter::iterator_concept, std::random_access_iterator_tag>);
static_assert(std::is_same_v<Iter::iterator_category, std::input_iterator_tag>);
static_assert(std::is_same_v<Iter::difference_type, std::ptrdiff_t>);
-#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet
- static_assert(std::is_same_v<Iter::value_type, std::pair<int, int>>);
-#else
static_assert(std::is_same_v<Iter::value_type, std::tuple<int, int>>);
-#endif
static_assert(HasIterCategory<Iter>);
}
@@ -124,11 +120,7 @@ void test() {
static_assert(std::is_same_v<Iter::iterator_concept, std::random_access_iterator_tag>);
static_assert(std::is_same_v<Iter::iterator_category, std::input_iterator_tag>);
static_assert(std::is_same_v<Iter::difference_type, std::ptrdiff_t>);
-#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet
- static_assert(std::is_same_v<Iter::value_type, std::pair<int, std::pair<int, int>>>);
-#else
static_assert(std::is_same_v<Iter::value_type, std::tuple<int, std::tuple<int, int>>>);
-#endif
static_assert(HasIterCategory<Iter>);
}
@@ -169,11 +161,7 @@ void test() {
// value_type of multiple views with different value_type
std::ranges::zip_view v{foos, bars};
using Iter = decltype(v.begin());
-#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet
- static_assert(std::is_same_v<Iter::value_type, std::pair<Foo, Bar>>);
-#else
static_assert(std::is_same_v<Iter::value_type, std::tuple<Foo, Bar>>);
-#endif
}
{
diff --git a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/subscript.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/subscript.pass.cpp
index 1538d76..ba3abfa2 100644
--- a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/subscript.pass.cpp
+++ b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/subscript.pass.cpp
@@ -27,11 +27,7 @@ constexpr bool test() {
assert(it[2] == *(it + 2));
assert(it[4] == *(it + 4));
-#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet
- static_assert(std::is_same_v<decltype(it[2]), std::pair<int&, int>>);
-#else
static_assert(std::is_same_v<decltype(it[2]), std::tuple<int&, int>>);
-#endif
}
{
@@ -42,11 +38,7 @@ constexpr bool test() {
assert(it[2] == *(it + 2));
assert(it[4] == *(it + 4));
-#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet
- static_assert(std::is_same_v<decltype(it[2]), std::pair<int&, int&>>);
-#else
static_assert(std::is_same_v<decltype(it[2]), std::tuple<int&, int&>>);
-#endif
}
{
diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
index 1d5e6e0..63748a7 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
@@ -1267,7 +1267,7 @@ def run_vscode(dbg, args, options):
def main():
parser = optparse.OptionParser(
description=(
- "A testing framework for the Visual Studio Code Debug " "Adaptor protocol"
+ "A testing framework for the Visual Studio Code Debug Adaptor protocol"
)
)
diff --git a/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_arm64.cpp b/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_arm64.cpp
index 1a6defb..7adc006 100644
--- a/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_arm64.cpp
+++ b/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_arm64.cpp
@@ -119,17 +119,15 @@ NativeRegisterContextFreeBSD_arm64::ReadRegister(const RegisterInfo *reg_info,
RegisterValue &reg_value) {
Status error;
- if (!reg_info) {
- error = Status::FromErrorString("reg_info NULL");
- return error;
- }
+ if (!reg_info)
+ return Status::FromErrorString("reg_info NULL");
const uint32_t reg = reg_info->kinds[lldb::eRegisterKindLLDB];
if (reg == LLDB_INVALID_REGNUM)
- return Status("no lldb regnum for %s", reg_info && reg_info->name
- ? reg_info->name
- : "<unknown register>");
+ return Status::FromErrorStringWithFormat(
+ "no lldb regnum for %s",
+ reg_info && reg_info->name ? reg_info->name : "<unknown register>");
uint32_t set = GetRegisterInfo().GetRegisterSetFromRegisterIndex(reg);
error = ReadRegisterSet(set);
@@ -147,14 +145,14 @@ Status NativeRegisterContextFreeBSD_arm64::WriteRegister(
Status error;
if (!reg_info)
- return Status("reg_info NULL");
+ return Status::FromErrorString("reg_info NULL");
const uint32_t reg = reg_info->kinds[lldb::eRegisterKindLLDB];
if (reg == LLDB_INVALID_REGNUM)
- return Status("no lldb regnum for %s", reg_info && reg_info->name
- ? reg_info->name
- : "<unknown register>");
+ return Status::FromErrorStringWithFormat(
+ "no lldb regnum for %s",
+ reg_info && reg_info->name ? reg_info->name : "<unknown register>");
uint32_t set = GetRegisterInfo().GetRegisterSetFromRegisterIndex(reg);
error = ReadRegisterSet(set);
diff --git a/lldb/source/Utility/DiagnosticsRendering.cpp b/lldb/source/Utility/DiagnosticsRendering.cpp
index 96caf93..d28a9ab 100644
--- a/lldb/source/Utility/DiagnosticsRendering.cpp
+++ b/lldb/source/Utility/DiagnosticsRendering.cpp
@@ -77,11 +77,7 @@ void RenderDiagnosticDetails(Stream &stream,
spacer = "";
}
- // Print a line with caret indicator(s) below the lldb prompt + command.
- const size_t padding = *offset_in_command;
- stream << std::string(padding, ' ');
-
- size_t offset = 1;
+ // Partition the diagnostics.
std::vector<DiagnosticDetail> remaining_details, other_details,
hidden_details;
for (const DiagnosticDetail &detail : details) {
@@ -98,14 +94,39 @@ void RenderDiagnosticDetails(Stream &stream,
continue;
}
- auto &loc = *detail.source_location;
remaining_details.push_back(detail);
- if (offset > loc.column)
- continue;
- stream << std::string(loc.column - offset, ' ') << cursor;
- for (unsigned i = 0; i + 1 < loc.length; ++i)
- stream << underline;
- offset = loc.column + 1;
+ }
+
+ // Sort the diagnostics.
+ auto sort = [](auto &ds) {
+ llvm::sort(ds.begin(), ds.end(), [](auto &d1, auto &d2) {
+ auto l1 = d1.source_location.value_or(DiagnosticDetail::SourceLocation{});
+ auto l2 = d2.source_location.value_or(DiagnosticDetail::SourceLocation{});
+ return std::pair(l1.line, l1.column) < std::pair(l2.line, l2.column);
+ });
+ };
+ sort(remaining_details);
+ sort(other_details);
+ sort(hidden_details);
+
+ // Print a line with caret indicator(s) below the lldb prompt + command.
+ const size_t padding = *offset_in_command;
+ stream << std::string(padding, ' ');
+ {
+ size_t x_pos = 1;
+ for (const DiagnosticDetail &detail : remaining_details) {
+ auto &loc = *detail.source_location;
+
+ if (x_pos > loc.column)
+ continue;
+
+ stream << std::string(loc.column - x_pos, ' ') << cursor;
+ x_pos = loc.column + 1;
+ for (unsigned i = 0; i + 1 < loc.length; ++i) {
+ stream << underline;
+ ++x_pos;
+ }
+ }
}
stream << '\n';
@@ -117,18 +138,19 @@ void RenderDiagnosticDetails(Stream &stream,
// Get the information to print this detail and remove it from the stack.
// Print all the lines for all the other messages first.
stream << std::string(padding, ' ');
- size_t offset = 1;
+ size_t x_pos = 1;
for (auto &remaining_detail :
llvm::ArrayRef(remaining_details).drop_back(1)) {
uint16_t column = remaining_detail.source_location->column;
- stream << std::string(column - offset, ' ') << vbar;
- offset = column + 1;
+ if (x_pos <= column)
+ stream << std::string(column - x_pos, ' ') << vbar;
+ x_pos = column + 1;
}
// Print the line connecting the ^ with the error message.
uint16_t column = detail->source_location->column;
- if (offset <= column)
- stream << std::string(column - offset, ' ') << joint << hbar << spacer;
+ if (x_pos <= column)
+ stream << std::string(column - x_pos, ' ') << joint << hbar << spacer;
// Print a colorized string based on the message's severity type.
PrintSeverity(stream, detail->severity);
diff --git a/lldb/test/API/commands/expression/import-std-module/array/TestArrayFromStdModule.py b/lldb/test/API/commands/expression/import-std-module/array/TestArrayFromStdModule.py
index 13ab6b0..bafc762 100644
--- a/lldb/test/API/commands/expression/import-std-module/array/TestArrayFromStdModule.py
+++ b/lldb/test/API/commands/expression/import-std-module/array/TestArrayFromStdModule.py
@@ -10,6 +10,7 @@ from lldbsuite.test import lldbutil
class TestCase(TestBase):
@add_test_categories(["libc++"])
@skipIf(compiler=no_match("clang"))
+ @skipIfLinux # https://discourse.llvm.org/t/lldb-test-failures-on-linux/80095
def test(self):
self.build()
diff --git a/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py b/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py
index 1c32222..71eaeef 100644
--- a/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py
+++ b/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py
@@ -14,6 +14,7 @@ class TestDbgInfoContentVector(TestBase):
@skipIf(compiler="clang", compiler_version=["<", "12.0"])
@skipIf(macos_version=["<", "14.0"])
@skipIfDarwin # https://github.com/llvm/llvm-project/issues/106475
+ @skipIfLinux # https://discourse.llvm.org/t/lldb-test-failures-on-linux/80095
def test(self):
self.build()
diff --git a/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py b/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py
index a1f3327..e9415fd 100644
--- a/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py
+++ b/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py
@@ -10,6 +10,7 @@ from lldbsuite.test import lldbutil
class TestVectorOfVectors(TestBase):
@add_test_categories(["libc++"])
@skipIf(compiler=no_match("clang"))
+ @skipIfLinux # https://discourse.llvm.org/t/lldb-test-failures-on-linux/80095
def test(self):
self.build()
diff --git a/lldb/test/API/commands/expression/top-level/Makefile b/lldb/test/API/commands/expression/top-level/Makefile
index e5e9e78..51b27dd 100644
--- a/lldb/test/API/commands/expression/top-level/Makefile
+++ b/lldb/test/API/commands/expression/top-level/Makefile
@@ -5,6 +5,6 @@ all: dummy
include Makefile.rules
dummy: dummy.cpp
- $(MAKE) -f $(MAKEFILE_RULES) \
+ "$(MAKE)" -f $(MAKEFILE_RULES) \
CXX_SOURCES=dummy.cpp EXE=dummy
diff --git a/lldb/test/API/commands/expression/weak_symbols/Makefile b/lldb/test/API/commands/expression/weak_symbols/Makefile
index 6fd8133..1636e9b 100644
--- a/lldb/test/API/commands/expression/weak_symbols/Makefile
+++ b/lldb/test/API/commands/expression/weak_symbols/Makefile
@@ -9,12 +9,12 @@ a.out: libdylib.dylib
include Makefile.rules
libdylib.dylib: dylib.c
- $(MAKE) -C $(BUILDDIR) -f $(MAKEFILE_RULES) \
+ "$(MAKE)" -C $(BUILDDIR) -f $(MAKEFILE_RULES) \
C_SOURCES= DYLIB_C_SOURCES=dylib.c DYLIB_NAME=dylib \
CFLAGS_EXTRAS=-DHAS_THEM LD_EXTRAS=-dynamiclib
hidden/libdylib.dylib:
mkdir hidden
- $(MAKE) -C $(BUILDDIR)/hidden -f $(MAKEFILE_RULES) \
+ "$(MAKE)" -C $(BUILDDIR)/hidden -f $(MAKEFILE_RULES) \
C_SOURCES= DYLIB_C_SOURCES=dylib.c DYLIB_NAME=dylib \
LD_EXTRAS=-dynamiclib
diff --git a/lldb/test/API/commands/target/create-deps/Makefile b/lldb/test/API/commands/target/create-deps/Makefile
index 3e5b104..866d550 100644
--- a/lldb/test/API/commands/target/create-deps/Makefile
+++ b/lldb/test/API/commands/target/create-deps/Makefile
@@ -6,5 +6,5 @@ a.out: libload_a
include Makefile.rules
libload_a:
- $(MAKE) -f $(MAKEFILE_RULES) \
+ "$(MAKE)" -f $(MAKEFILE_RULES) \
DYLIB_ONLY=YES DYLIB_NAME=load_a DYLIB_CXX_SOURCES=a.cpp
diff --git a/lldb/test/API/functionalities/breakpoint/break_in_loaded_dylib/Makefile b/lldb/test/API/functionalities/breakpoint/break_in_loaded_dylib/Makefile
index 0f3fb37..112210e 100644
--- a/lldb/test/API/functionalities/breakpoint/break_in_loaded_dylib/Makefile
+++ b/lldb/test/API/functionalities/breakpoint/break_in_loaded_dylib/Makefile
@@ -2,7 +2,7 @@ CXX_SOURCES := main.cpp
USE_LIBDL := 1
lib_b:
- $(MAKE) -f $(MAKEFILE_RULES) \
+ "$(MAKE)" -f $(MAKEFILE_RULES) \
DYLIB_ONLY=YES DYLIB_CXX_SOURCES=b.cpp DYLIB_NAME=lib_b
all: lib_b
diff --git a/lldb/test/API/functionalities/dlopen_other_executable/Makefile b/lldb/test/API/functionalities/dlopen_other_executable/Makefile
index 113b9fd..51fc01b 100644
--- a/lldb/test/API/functionalities/dlopen_other_executable/Makefile
+++ b/lldb/test/API/functionalities/dlopen_other_executable/Makefile
@@ -2,7 +2,7 @@ C_SOURCES := main.c
USE_LIBDL := 1
other:
- $(MAKE) -f $(MAKEFILE_RULES) C_SOURCES=other.c EXE=other
+ "$(MAKE)" -f $(MAKEFILE_RULES) C_SOURCES=other.c EXE=other
all: other
include Makefile.rules
diff --git a/lldb/test/API/functionalities/exec/Makefile b/lldb/test/API/functionalities/exec/Makefile
index 8b9148ea..65d4680 100644
--- a/lldb/test/API/functionalities/exec/Makefile
+++ b/lldb/test/API/functionalities/exec/Makefile
@@ -5,5 +5,5 @@ all: secondprog
include Makefile.rules
secondprog: secondprog.cpp
- $(MAKE) -f $(MAKEFILE_RULES) \
+ "$(MAKE)" -f $(MAKEFILE_RULES) \
CXX_SOURCES=secondprog.cpp EXE=secondprog
diff --git a/lldb/test/API/functionalities/jitloader_gdb/Makefile b/lldb/test/API/functionalities/jitloader_gdb/Makefile
index 357b1f8..9998cc9 100644
--- a/lldb/test/API/functionalities/jitloader_gdb/Makefile
+++ b/lldb/test/API/functionalities/jitloader_gdb/Makefile
@@ -5,5 +5,5 @@ all: a.out simple
include Makefile.rules
simple:
- $(MAKE) -f $(MAKEFILE_RULES) \
+ "$(MAKE)" -f $(MAKEFILE_RULES) \
C_SOURCES=simple.c EXE=simple
diff --git a/lldb/test/API/functionalities/limit-debug-info/Makefile b/lldb/test/API/functionalities/limit-debug-info/Makefile
index 874b3a1..fa867a7 100644
--- a/lldb/test/API/functionalities/limit-debug-info/Makefile
+++ b/lldb/test/API/functionalities/limit-debug-info/Makefile
@@ -17,11 +17,11 @@ include Makefile.rules
a.out: libone libtwo
libone:
- $(MAKE) -f $(MAKEFILE_RULES) \
+ "$(MAKE)" -f $(MAKEFILE_RULES) \
DYLIB_ONLY=YES DYLIB_CXX_SOURCES=one.cpp DYLIB_NAME=one \
CFLAGS_EXTRAS="$(ONE_CXXFLAGS)"
libtwo: libone
- $(MAKE) -f $(MAKEFILE_RULES) \
+ "$(MAKE)" -f $(MAKEFILE_RULES) \
DYLIB_ONLY=YES DYLIB_CXX_SOURCES=two.cpp DYLIB_NAME=two \
CFLAGS_EXTRAS="$(TWO_CXXFLAGS)" LD_EXTRAS="-L. -lone"
diff --git a/lldb/test/API/functionalities/load_after_attach/Makefile b/lldb/test/API/functionalities/load_after_attach/Makefile
index 0f3fb37..112210e 100644
--- a/lldb/test/API/functionalities/load_after_attach/Makefile
+++ b/lldb/test/API/functionalities/load_after_attach/Makefile
@@ -2,7 +2,7 @@ CXX_SOURCES := main.cpp
USE_LIBDL := 1
lib_b:
- $(MAKE) -f $(MAKEFILE_RULES) \
+ "$(MAKE)" -f $(MAKEFILE_RULES) \
DYLIB_ONLY=YES DYLIB_CXX_SOURCES=b.cpp DYLIB_NAME=lib_b
all: lib_b
diff --git a/lldb/test/API/functionalities/load_lazy/Makefile b/lldb/test/API/functionalities/load_lazy/Makefile
index 81bc7dc..8e1d06b 100644
--- a/lldb/test/API/functionalities/load_lazy/Makefile
+++ b/lldb/test/API/functionalities/load_lazy/Makefile
@@ -17,13 +17,13 @@ else
endif
t1: t2_0
- $(MAKE) VPATH=$(SRCDIR) -f $(MAKEFILE_RULES) \
+ "$(MAKE)" VPATH=$(SRCDIR) -f $(MAKEFILE_RULES) \
DYLIB_ONLY=YES DYLIB_C_SOURCES=t1.c DYLIB_NAME=t1 LD_EXTRAS="-L. $(LINKFLAGS)"
t2_0:
- $(MAKE) VPATH=$(SRCDIR) -f $(MAKEFILE_RULES) \
+ "$(MAKE)" VPATH=$(SRCDIR) -f $(MAKEFILE_RULES) \
DYLIB_ONLY=YES DYLIB_C_SOURCES=t2_0.c DYLIB_NAME=t2_0
t2_1:
- $(MAKE) VPATH=$(SRCDIR) -f $(MAKEFILE_RULES) \
+ "$(MAKE)" VPATH=$(SRCDIR) -f $(MAKEFILE_RULES) \
DYLIB_ONLY=YES DYLIB_C_SOURCES=t2_1.c DYLIB_NAME=t2_1
diff --git a/lldb/test/API/functionalities/load_unload/Makefile b/lldb/test/API/functionalities/load_unload/Makefile
index e73ec73..dd7d160 100644
--- a/lldb/test/API/functionalities/load_unload/Makefile
+++ b/lldb/test/API/functionalities/load_unload/Makefile
@@ -7,25 +7,25 @@ a.out: lib_b lib_a lib_c lib_d hidden_lib_d
include Makefile.rules
lib_a: lib_b
- $(MAKE) -f $(MAKEFILE_RULES) \
+ "$(MAKE)" -f $(MAKEFILE_RULES) \
DYLIB_ONLY=YES DYLIB_CXX_SOURCES=a.cpp DYLIB_NAME=loadunload_a \
LD_EXTRAS="-L. -lloadunload_b"
lib_b:
- $(MAKE) -f $(MAKEFILE_RULES) \
+ "$(MAKE)" -f $(MAKEFILE_RULES) \
DYLIB_ONLY=YES DYLIB_CXX_SOURCES=b.cpp DYLIB_NAME=loadunload_b
lib_c:
- $(MAKE) -f $(MAKEFILE_RULES) \
+ "$(MAKE)" -f $(MAKEFILE_RULES) \
DYLIB_ONLY=YES DYLIB_CXX_SOURCES=c.cpp DYLIB_NAME=loadunload_c
lib_d:
- $(MAKE) -f $(MAKEFILE_RULES) \
+ "$(MAKE)" -f $(MAKEFILE_RULES) \
DYLIB_ONLY=YES DYLIB_CXX_SOURCES=d.cpp DYLIB_NAME=loadunload_d
ifeq ($(OS),Darwin)
install_name_tool -id @executable_path/libloadunload_d.dylib libloadunload_d.dylib
endif
hidden_lib_d: hidden
- $(MAKE) VPATH=$(SRCDIR)/hidden -C hidden -f $(MAKEFILE_RULES) \
+ "$(MAKE)" VPATH=$(SRCDIR)/hidden -C hidden -f $(MAKEFILE_RULES) \
DYLIB_ONLY=YES DYLIB_CXX_SOURCES=d.cpp DYLIB_NAME=loadunload_d
diff --git a/lldb/test/API/functionalities/load_using_paths/Makefile b/lldb/test/API/functionalities/load_using_paths/Makefile
index 814a960..f973a38 100644
--- a/lldb/test/API/functionalities/load_using_paths/Makefile
+++ b/lldb/test/API/functionalities/load_using_paths/Makefile
@@ -6,6 +6,6 @@ all: hidden_lib a.out
include Makefile.rules
hidden_lib:
- $(MAKE) VPATH=$(SRCDIR)/hidden -C hidden -f $(MAKEFILE_RULES) \
+ "$(MAKE)" VPATH=$(SRCDIR)/hidden -C hidden -f $(MAKEFILE_RULES) \
DYLIB_ONLY=YES DYLIB_CXX_SOURCES=d.cpp DYLIB_NAME=loadunload
diff --git a/lldb/test/API/functionalities/scripted_process/Makefile b/lldb/test/API/functionalities/scripted_process/Makefile
index ba73945..d4f12fb 100644
--- a/lldb/test/API/functionalities/scripted_process/Makefile
+++ b/lldb/test/API/functionalities/scripted_process/Makefile
@@ -9,7 +9,7 @@ CXXFLAGS_EXTRAS := -target $(TRIPLE)
all: libbaz.dylib a.out
libbaz.dylib: baz.cpp
- $(MAKE) -f $(MAKEFILE_RULES) \
+ "$(MAKE)" -f $(MAKEFILE_RULES) \
DYLIB_ONLY=YES DYLIB_NAME=baz DYLIB_CXX_SOURCES=baz.cpp
include Makefile.rules
diff --git a/lldb/test/API/functionalities/stop-on-sharedlibrary-load/Makefile b/lldb/test/API/functionalities/stop-on-sharedlibrary-load/Makefile
index 4abcab8..e4b0e86 100644
--- a/lldb/test/API/functionalities/stop-on-sharedlibrary-load/Makefile
+++ b/lldb/test/API/functionalities/stop-on-sharedlibrary-load/Makefile
@@ -6,11 +6,11 @@ a.out: lib_a lib_b
include Makefile.rules
lib_a:
- $(MAKE) -f $(MAKEFILE_RULES) \
+ "$(MAKE)" -f $(MAKEFILE_RULES) \
DYLIB_ONLY=YES DYLIB_CXX_SOURCES=a.cpp DYLIB_NAME=load_a
lib_b:
- $(MAKE) -f $(MAKEFILE_RULES) \
+ "$(MAKE)" -f $(MAKEFILE_RULES) \
DYLIB_ONLY=YES DYLIB_CXX_SOURCES=b.cpp DYLIB_NAME=load_b
diff --git a/lldb/test/API/functionalities/tail_call_frames/cross_dso/Makefile b/lldb/test/API/functionalities/tail_call_frames/cross_dso/Makefile
index 42c010b..963ce2a 100644
--- a/lldb/test/API/functionalities/tail_call_frames/cross_dso/Makefile
+++ b/lldb/test/API/functionalities/tail_call_frames/cross_dso/Makefile
@@ -10,4 +10,4 @@ a.out: lib_One lib_Two
lib_One: lib_Two
lib_%:
- $(MAKE) VPATH=$(SRCDIR)/$* -I $(SRCDIR) -f $(SRCDIR)/$*.mk DSYMUTIL=$(DSYMUTIL)
+ "$(MAKE)" VPATH=$(SRCDIR)/$* -I $(SRCDIR) -f $(SRCDIR)/$*.mk DSYMUTIL=$(DSYMUTIL)
diff --git a/lldb/test/API/functionalities/target-new-solib-notifications/Makefile b/lldb/test/API/functionalities/target-new-solib-notifications/Makefile
index 6c61d21..e3b4869 100644
--- a/lldb/test/API/functionalities/target-new-solib-notifications/Makefile
+++ b/lldb/test/API/functionalities/target-new-solib-notifications/Makefile
@@ -1,23 +1,23 @@
CXX_SOURCES := main.cpp
-LD_EXTRAS := -L. -l_d -l_c -l_a -l_b
+LD_EXTRAS := -L. -l_d -l_c -l_a -l_b
a.out: lib_b lib_a lib_c lib_d
include Makefile.rules
lib_a: lib_b
- $(MAKE) -f $(MAKEFILE_RULES) \
- DYLIB_ONLY=YES DYLIB_CXX_SOURCES=a.cpp DYLIB_NAME=_a \
- LD_EXTRAS="-L. -l_b"
+ "$(MAKE)" -f $(MAKEFILE_RULES) \
+ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=a.cpp DYLIB_NAME=_a \
+ LD_EXTRAS="-L. -l_b"
lib_b:
- $(MAKE) -f $(MAKEFILE_RULES) \
- DYLIB_ONLY=YES DYLIB_CXX_SOURCES=b.cpp DYLIB_NAME=_b
+ "$(MAKE)" -f $(MAKEFILE_RULES) \
+ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=b.cpp DYLIB_NAME=_b
lib_c:
- $(MAKE) -f $(MAKEFILE_RULES) \
- DYLIB_ONLY=YES DYLIB_CXX_SOURCES=c.cpp DYLIB_NAME=_c
+ "$(MAKE)" -f $(MAKEFILE_RULES) \
+ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=c.cpp DYLIB_NAME=_c
lib_d:
- $(MAKE) -f $(MAKEFILE_RULES) \
- DYLIB_ONLY=YES DYLIB_CXX_SOURCES=d.cpp DYLIB_NAME=_d
+ "$(MAKE)" -f $(MAKEFILE_RULES) \
+ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=d.cpp DYLIB_NAME=_d
diff --git a/lldb/test/API/lang/c/conflicting-symbol/Makefile b/lldb/test/API/lang/c/conflicting-symbol/Makefile
index 81594a1..1331c4e 100644
--- a/lldb/test/API/lang/c/conflicting-symbol/Makefile
+++ b/lldb/test/API/lang/c/conflicting-symbol/Makefile
@@ -7,4 +7,4 @@ include Makefile.rules
a.out: lib_One lib_Two
lib_%:
- $(MAKE) VPATH=$(SRCDIR)/$* -I $(SRCDIR) -f $(SRCDIR)/$*.mk
+ "$(MAKE)" VPATH=$(SRCDIR)/$* -I $(SRCDIR) -f $(SRCDIR)/$*.mk
diff --git a/lldb/test/API/lang/cpp/incomplete-types/Makefile b/lldb/test/API/lang/cpp/incomplete-types/Makefile
index f42ac2e..0cf3f6a 100644
--- a/lldb/test/API/lang/cpp/incomplete-types/Makefile
+++ b/lldb/test/API/lang/cpp/incomplete-types/Makefile
@@ -16,7 +16,7 @@ main.o: CFLAGS_EXTRAS = -flimit-debug-info
limit: a.o main.o
mkdir -p build_limit
- $(MAKE) -C $(BUILDDIR)/build_limit -f $(MAKEFILE_RULES) \
+ "$(MAKE)" -C $(BUILDDIR)/build_limit -f $(MAKEFILE_RULES) \
EXE=../limit CXX_SOURCES="length.cpp ../a.o ../main.o" \
CFLAGS_EXTRAS=-flimit-debug-info NO_LIMIT_DEBUG_INFO_FLAGS=""
diff --git a/lldb/test/API/lang/cpp/namespace_definitions/Makefile b/lldb/test/API/lang/cpp/namespace_definitions/Makefile
index fc9165f..b17d70f 100644
--- a/lldb/test/API/lang/cpp/namespace_definitions/Makefile
+++ b/lldb/test/API/lang/cpp/namespace_definitions/Makefile
@@ -6,10 +6,10 @@ a.out: liba libb
include Makefile.rules
liba:
- $(MAKE) -f $(MAKEFILE_RULES) \
+ "$(MAKE)" -f $(MAKEFILE_RULES) \
DYLIB_ONLY=YES DYLIB_NAME=a DYLIB_CXX_SOURCES=a.cpp
libb:
- $(MAKE) -f $(MAKEFILE_RULES) \
+ "$(MAKE)" -f $(MAKEFILE_RULES) \
DYLIB_ONLY=YES DYLIB_NAME=b DYLIB_CXX_SOURCES=b.cpp
diff --git a/lldb/test/API/lang/cpp/stl/Makefile b/lldb/test/API/lang/cpp/stl/Makefile
index 4408691..99998b2 100644
--- a/lldb/test/API/lang/cpp/stl/Makefile
+++ b/lldb/test/API/lang/cpp/stl/Makefile
@@ -1,9 +1,3 @@
CXX_SOURCES := main.cpp
-ifneq ($(OS),Darwin)
- USE_LIBSTDCPP := 1
-else
- USE_SYSTEM_STDLIB := 1
-endif
-
include Makefile.rules
diff --git a/lldb/test/API/lang/objc/conflicting-definition/Makefile b/lldb/test/API/lang/objc/conflicting-definition/Makefile
index 00a0769..cba79c9 100644
--- a/lldb/test/API/lang/objc/conflicting-definition/Makefile
+++ b/lldb/test/API/lang/objc/conflicting-definition/Makefile
@@ -9,14 +9,14 @@ include Makefile.rules
libTest.dylib: Test/Test.m
mkdir -p Test
- $(MAKE) MAKE_DSYM=YES -f $(MAKEFILE_RULES) \
+ "$(MAKE)" MAKE_DSYM=YES -f $(MAKEFILE_RULES) \
DYLIB_ONLY=YES DYLIB_NAME=Test DYLIB_OBJC_SOURCES=Test/Test.m \
LD_EXTRAS="-lobjc -framework Foundation" \
CFLAGS_EXTRAS=-I$(SRCDIR)
libTestExt.dylib: TestExt/TestExt.m
mkdir -p TestExt
- $(MAKE) MAKE_DSYM=YES -f $(MAKEFILE_RULES) \
+ "$(MAKE)" MAKE_DSYM=YES -f $(MAKEFILE_RULES) \
DYLIB_ONLY=YES DYLIB_NAME=TestExt DYLIB_OBJC_SOURCES=TestExt/TestExt.m \
LD_EXTRAS="-lobjc -framework Foundation -lTest -L." \
CFLAGS_EXTRAS=-I$(SRCDIR)
diff --git a/lldb/test/API/lang/objc/modules-hash-mismatch/Makefile b/lldb/test/API/lang/objc/modules-hash-mismatch/Makefile
index 59bf009..57da670 100644
--- a/lldb/test/API/lang/objc/modules-hash-mismatch/Makefile
+++ b/lldb/test/API/lang/objc/modules-hash-mismatch/Makefile
@@ -5,7 +5,7 @@ USE_PRIVATE_MODULE_CACHE = YES
.PHONY: update-module
all: $(EXE)
- $(MAKE) -f $(SRCDIR)/Makefile update-module
+ "$(MAKE)" -f $(SRCDIR)/Makefile update-module
include Makefile.rules
diff --git a/lldb/test/API/macosx/delay-init-dependency/Makefile b/lldb/test/API/macosx/delay-init-dependency/Makefile
index 246ea0f..7421c68 100644
--- a/lldb/test/API/macosx/delay-init-dependency/Makefile
+++ b/lldb/test/API/macosx/delay-init-dependency/Makefile
@@ -7,5 +7,5 @@ all: build-libfoo a.out
include Makefile.rules
build-libfoo: foo.c
- $(MAKE) -f $(MAKEFILE_RULES) \
+ "$(MAKE)" -f $(MAKEFILE_RULES) \
DYLIB_C_SOURCES=foo.c DYLIB_NAME=foo DYLIB_ONLY=YES
diff --git a/lldb/test/API/macosx/expedited-thread-pcs/Makefile b/lldb/test/API/macosx/expedited-thread-pcs/Makefile
index 7799f06..73a9698 100644
--- a/lldb/test/API/macosx/expedited-thread-pcs/Makefile
+++ b/lldb/test/API/macosx/expedited-thread-pcs/Makefile
@@ -6,6 +6,6 @@ all: build-libfoo a.out
include Makefile.rules
build-libfoo: foo.c
- $(MAKE) -f $(MAKEFILE_RULES) \
+ "$(MAKE)" -f $(MAKEFILE_RULES) \
DYLIB_C_SOURCES=foo.c DYLIB_NAME=foo DYLIB_ONLY=YES
diff --git a/lldb/test/API/macosx/indirect_symbol/Makefile b/lldb/test/API/macosx/indirect_symbol/Makefile
index 9069302..dee3e18 100644
--- a/lldb/test/API/macosx/indirect_symbol/Makefile
+++ b/lldb/test/API/macosx/indirect_symbol/Makefile
@@ -7,11 +7,11 @@ all: build-libindirect build-libreepxoprt a.out
include Makefile.rules
build-libindirect: indirect.c
- $(MAKE) -f $(MAKEFILE_RULES) \
+ "$(MAKE)" -f $(MAKEFILE_RULES) \
DYLIB_C_SOURCES=indirect.c DYLIB_NAME=indirect DYLIB_ONLY=YES \
LD_EXTRAS="-Wl,-image_base,0x200000000"
build-libreepxoprt: reexport.c
- $(MAKE) -f $(MAKEFILE_RULES) \
+ "$(MAKE)" -f $(MAKEFILE_RULES) \
DYLIB_C_SOURCES=reexport.c DYLIB_NAME=reexport DYLIB_ONLY=YES \
LD_EXTRAS="-L. -lindirect -Wl,-alias_list,$(SRCDIR)/alias.list"
diff --git a/lldb/test/API/macosx/lc-note/kern-ver-str/Makefile b/lldb/test/API/macosx/lc-note/kern-ver-str/Makefile
index 05d9552..01b4acf 100644
--- a/lldb/test/API/macosx/lc-note/kern-ver-str/Makefile
+++ b/lldb/test/API/macosx/lc-note/kern-ver-str/Makefile
@@ -5,7 +5,7 @@ C_SOURCES := main.c
all: a.out create-empty-corefile
create-empty-corefile:
- $(MAKE) -f $(MAKEFILE_RULES) EXE=create-empty-corefile \
+ "$(MAKE)" -f $(MAKEFILE_RULES) EXE=create-empty-corefile \
CXX=$(CC) CXX_SOURCES=create-empty-corefile.cpp
include Makefile.rules
diff --git a/lldb/test/API/macosx/lc-note/multiple-binary-corefile/Makefile b/lldb/test/API/macosx/lc-note/multiple-binary-corefile/Makefile
index 8e561f1..229235c 100644
--- a/lldb/test/API/macosx/lc-note/multiple-binary-corefile/Makefile
+++ b/lldb/test/API/macosx/lc-note/multiple-binary-corefile/Makefile
@@ -10,11 +10,11 @@ create-empty-corefile:
CXX_SOURCES=create-multibin-corefile.cpp
libone.dylib: one.c
- $(MAKE) -f $(MAKEFILE_RULES) \
+ "$(MAKE)" -f $(MAKEFILE_RULES) \
DYLIB_ONLY=YES DYLIB_NAME=one DYLIB_C_SOURCES=one.c
libtwo.dylib: two.c
- $(MAKE) -f $(MAKEFILE_RULES) \
+ "$(MAKE)" -f $(MAKEFILE_RULES) \
DYLIB_ONLY=YES DYLIB_NAME=two DYLIB_C_SOURCES=two.c
include Makefile.rules
diff --git a/lldb/test/API/macosx/macCatalystAppMacOSFramework/Makefile b/lldb/test/API/macosx/macCatalystAppMacOSFramework/Makefile
index c77a186..0dc9e71 100644
--- a/lldb/test/API/macosx/macCatalystAppMacOSFramework/Makefile
+++ b/lldb/test/API/macosx/macCatalystAppMacOSFramework/Makefile
@@ -11,7 +11,7 @@ override CC=xcrun clang
all: libfoo.dylib a.out
libfoo.dylib: foo.c
- $(MAKE) -f $(MAKEFILE_RULES) \
+ "$(MAKE)" -f $(MAKEFILE_RULES) \
DYLIB_ONLY=YES DYLIB_NAME=foo DYLIB_C_SOURCES=foo.c
include Makefile.rules
diff --git a/lldb/test/API/macosx/skinny-corefile/Makefile b/lldb/test/API/macosx/skinny-corefile/Makefile
index efe37f3..fce43a3 100644
--- a/lldb/test/API/macosx/skinny-corefile/Makefile
+++ b/lldb/test/API/macosx/skinny-corefile/Makefile
@@ -6,10 +6,10 @@ include Makefile.rules
a.out: libto-be-removed libpresent
libto-be-removed: libpresent
- $(MAKE) -f $(MAKEFILE_RULES) \
+ "$(MAKE)" -f $(MAKEFILE_RULES) \
DYLIB_ONLY=YES DYLIB_C_SOURCES=to-be-removed.c DYLIB_NAME=to-be-removed \
LD_EXTRAS="-L. -lpresent"
libpresent:
- $(MAKE) -f $(MAKEFILE_RULES) \
+ "$(MAKE)" -f $(MAKEFILE_RULES) \
DYLIB_ONLY=YES DYLIB_C_SOURCES=present.c DYLIB_NAME=present
diff --git a/lldb/test/API/tools/lldb-dap/breakpoint/Makefile b/lldb/test/API/tools/lldb-dap/breakpoint/Makefile
index 30a6400..7634f51 100644
--- a/lldb/test/API/tools/lldb-dap/breakpoint/Makefile
+++ b/lldb/test/API/tools/lldb-dap/breakpoint/Makefile
@@ -15,5 +15,5 @@ main-copy.cpp: main.cpp
# The following shared library will be used to test breakpoints under dynamic loading
libother: other-copy.c
- $(MAKE) -f $(MAKEFILE_RULES) \
+ "$(MAKE)" -f $(MAKEFILE_RULES) \
DYLIB_ONLY=YES DYLIB_C_SOURCES=other-copy.c DYLIB_NAME=other
diff --git a/lldb/test/API/tools/lldb-dap/send-event/Makefile b/lldb/test/API/tools/lldb-dap/send-event/Makefile
new file mode 100644
index 0000000..1049594
--- /dev/null
+++ b/lldb/test/API/tools/lldb-dap/send-event/Makefile
@@ -0,0 +1,3 @@
+C_SOURCES := main.c
+
+include Makefile.rules
diff --git a/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py b/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py
new file mode 100644
index 0000000..de47651
--- /dev/null
+++ b/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py
@@ -0,0 +1,67 @@
+"""
+Test lldb-dap send-event integration.
+"""
+
+import json
+
+from lldbsuite.test.decorators import *
+from lldbsuite.test.lldbtest import *
+import lldbdap_testcase
+
+
+class TestDAP_sendEvent(lldbdap_testcase.DAPTestCaseBase):
+ def test_send_event(self):
+ """
+ Test sending a custom event.
+ """
+ program = self.getBuildArtifact("a.out")
+ source = "main.c"
+ custom_event_body = {
+ "key": 321,
+ "arr": [True],
+ }
+ self.build_and_launch(
+ program,
+ stopCommands=[
+ "lldb-dap send-event my-custom-event-no-body",
+ "lldb-dap send-event my-custom-event '{}'".format(
+ json.dumps(custom_event_body)
+ ),
+ ],
+ )
+
+ breakpoint_line = line_number(source, "// breakpoint")
+
+ self.set_source_breakpoints(source, [breakpoint_line])
+ self.continue_to_next_stop()
+
+ custom_event = self.dap_server.wait_for_event(
+ filter=["my-custom-event-no-body"]
+ )
+ self.assertEquals(custom_event["event"], "my-custom-event-no-body")
+ self.assertIsNone(custom_event.get("body", None))
+
+ custom_event = self.dap_server.wait_for_event(filter=["my-custom-event"])
+ self.assertEquals(custom_event["event"], "my-custom-event")
+ self.assertEquals(custom_event["body"], custom_event_body)
+
+ def test_send_internal_event(self):
+ """
+ Test sending an internal event produces an error.
+ """
+ program = self.getBuildArtifact("a.out")
+ source = "main.c"
+ self.build_and_launch(program)
+
+ breakpoint_line = line_number(source, "// breakpoint")
+
+ self.set_source_breakpoints(source, [breakpoint_line])
+ self.continue_to_next_stop()
+
+ resp = self.dap_server.request_evaluate(
+ "`lldb-dap send-event stopped", context="repl"
+ )
+ self.assertRegex(
+ resp["body"]["result"],
+ r"Invalid use of lldb-dap send-event, event \"stopped\" should be handled by lldb-dap internally.",
+ )
diff --git a/lldb/test/API/tools/lldb-dap/send-event/main.c b/lldb/test/API/tools/lldb-dap/send-event/main.c
new file mode 100644
index 0000000..27bc22b
--- /dev/null
+++ b/lldb/test/API/tools/lldb-dap/send-event/main.c
@@ -0,0 +1,6 @@
+#include <stdio.h>
+
+int main(int argc, char const *argv[]) {
+ printf("example\n"); // breakpoint 1
+ return 0;
+}
diff --git a/lldb/test/API/tools/lldb-server/libraries-svr4/Makefile b/lldb/test/API/tools/lldb-server/libraries-svr4/Makefile
index 5b5c1dc..f13b1ac 100644
--- a/lldb/test/API/tools/lldb-server/libraries-svr4/Makefile
+++ b/lldb/test/API/tools/lldb-server/libraries-svr4/Makefile
@@ -9,11 +9,11 @@ a.out: svr4lib_a svr4lib_b_quote
include Makefile.rules
svr4lib_a:
- $(MAKE) -f $(MAKEFILE_RULES) \
+ "$(MAKE)" -f $(MAKEFILE_RULES) \
DYLIB_NAME=svr4lib_a DYLIB_CXX_SOURCES=svr4lib_a.cpp \
DYLIB_ONLY=YES
svr4lib_b_quote:
- $(MAKE) -f $(MAKEFILE_RULES) \
+ "$(MAKE)" -f $(MAKEFILE_RULES) \
DYLIB_NAME=svr4lib_b\\\" DYLIB_CXX_SOURCES=svr4lib_b_quote.cpp \
DYLIB_ONLY=YES
diff --git a/lldb/test/Shell/SymbolFile/DWARF/no_unique_address-alignment.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-alignment.cpp
index 1488199..e198bf0 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/no_unique_address-alignment.cpp
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-alignment.cpp
@@ -1,6 +1,6 @@
// XFAIL: *
-// RUN: %clangxx_host -gdwarf -o %t %s
+// RUN: %clang --target=x86_64-apple-macosx -c -gdwarf -o %t %s
// RUN: %lldb %t \
// RUN: -o "expr alignof(OverlappingFields)" \
// RUN: -o "expr sizeof(OverlappingFields)" \
diff --git a/lldb/test/Shell/SymbolFile/DWARF/no_unique_address-base-alignment.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-base-alignment.cpp
index 15d8de0..c4bcfc4 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/no_unique_address-base-alignment.cpp
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-base-alignment.cpp
@@ -1,6 +1,6 @@
// XFAIL: *
-// RUN: %clangxx_host -gdwarf -o %t %s
+// RUN: %clang --target=x86_64-apple-macosx -c -gdwarf -o %t %s
// RUN: %lldb %t \
// RUN: -o "expr alignof(OverlappingDerived)" \
// RUN: -o "expr sizeof(OverlappingDerived)" \
diff --git a/lldb/tools/debugserver/source/RNBRemote.cpp b/lldb/tools/debugserver/source/RNBRemote.cpp
index f22d626..07211c6 100644
--- a/lldb/tools/debugserver/source/RNBRemote.cpp
+++ b/lldb/tools/debugserver/source/RNBRemote.cpp
@@ -176,9 +176,6 @@ void append_hexified_string(std::ostream &ostrm, const std::string &string) {
}
}
-extern void ASLLogCallback(void *baton, uint32_t flags, const char *format,
- va_list args);
-
// from System.framework/Versions/B/PrivateHeaders/sys/codesign.h
extern "C" {
#define CS_OPS_STATUS 0 /* return status */
@@ -1773,8 +1770,6 @@ static std::string get_value(std::string &line) {
extern void FileLogCallback(void *baton, uint32_t flags, const char *format,
va_list args);
-extern void ASLLogCallback(void *baton, uint32_t flags, const char *format,
- va_list args);
rnb_err_t RNBRemote::HandlePacket_qRcmd(const char *p) {
const char *c = p + strlen("qRcmd,");
@@ -1809,8 +1804,8 @@ rnb_err_t RNBRemote::HandlePacket_qRcmd(const char *p) {
static_cast<uint32_t>(strtoul(value.c_str(), &end, 0));
if (errno == 0 && end && *end == '\0') {
DNBLogSetLogMask(logmask);
- if (!DNBLogGetLogCallback())
- DNBLogSetLogCallback(ASLLogCallback, NULL);
+ if (auto log_callback = OsLogger::GetLogFunction())
+ DNBLogSetLogCallback(log_callback, nullptr);
return SendPacket("OK");
}
errno = 0;
@@ -2177,13 +2172,8 @@ rnb_err_t set_logging(const char *p) {
// Enable DNB logging.
// Use the existing log callback if one was already configured.
if (!DNBLogGetLogCallback()) {
- // Use the os_log()-based logger if available; otherwise,
- // fallback to ASL.
- auto log_callback = OsLogger::GetLogFunction();
- if (log_callback)
+ if (auto log_callback = OsLogger::GetLogFunction())
DNBLogSetLogCallback(log_callback, nullptr);
- else
- DNBLogSetLogCallback(ASLLogCallback, nullptr);
}
// Update logging to use the configured log channel bitmask.
diff --git a/lldb/tools/debugserver/source/libdebugserver.cpp b/lldb/tools/debugserver/source/libdebugserver.cpp
index 6da3708..17a5c13 100644
--- a/lldb/tools/debugserver/source/libdebugserver.cpp
+++ b/lldb/tools/debugserver/source/libdebugserver.cpp
@@ -311,13 +311,6 @@ RNBRunLoopMode RNBRunLoopInferiorExecuting(RNBRemoteSP &remote) {
return mode;
}
-void ASLLogCallback(void *baton, uint32_t flags, const char *format,
- va_list args) {
-#if 0
- vprintf(format, args);
-#endif
-}
-
extern "C" int debug_server_main(int fd) {
#if 1
g_isatty = 0;
@@ -327,7 +320,6 @@ extern "C" int debug_server_main(int fd) {
DNBLogSetDebug(1);
DNBLogSetVerbose(1);
DNBLogSetLogMask(-1);
- DNBLogSetLogCallback(ASLLogCallback, NULL);
#endif
signal(SIGPIPE, signal_handler);
diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp
index 119779d..68559e3 100644
--- a/lldb/tools/lldb-dap/DAP.cpp
+++ b/lldb/tools/lldb-dap/DAP.cpp
@@ -962,6 +962,68 @@ bool ReplModeRequestHandler::DoExecute(lldb::SBDebugger debugger,
return true;
}
+// Sends a DAP event with an optional body.
+//
+// See
+// https://code.visualstudio.com/api/references/vscode-api#debug.onDidReceiveDebugSessionCustomEvent
+bool SendEventRequestHandler::DoExecute(lldb::SBDebugger debugger,
+ char **command,
+ lldb::SBCommandReturnObject &result) {
+ // Expected command format: `send-event <name> <body>?`
+ if (!command || !command[0] || llvm::StringRef(command[0]).empty()) {
+ result.SetError("Not enough arguments found, expected format "
+ "`lldb-dap send-event <name> <body>?`.");
+ return false;
+ }
+
+ llvm::StringRef name{command[0]};
+ // Events that are stateful and should be handled by lldb-dap internally.
+ const std::array internal_events{"breakpoint", "capabilities", "continued",
+ "exited", "initialize", "loadedSource",
+ "module", "process", "stopped",
+ "terminated", "thread"};
+ if (std::find(internal_events.begin(), internal_events.end(), name) !=
+ internal_events.end()) {
+ std::string msg =
+ llvm::formatv("Invalid use of lldb-dap send-event, event \"{0}\" "
+ "should be handled by lldb-dap internally.",
+ name)
+ .str();
+ result.SetError(msg.c_str());
+ return false;
+ }
+
+ llvm::json::Object event(CreateEventObject(name));
+
+ if (command[1] && !llvm::StringRef(command[1]).empty()) {
+ // See if we have unused arguments.
+ if (command[2]) {
+ result.SetError(
+ "Additional arguments found, expected `lldb-dap send-event "
+ "<name> <body>?`.");
+ return false;
+ }
+
+ llvm::StringRef raw_body{command[1]};
+
+ llvm::Expected<llvm::json::Value> body = llvm::json::parse(raw_body);
+
+ if (!body) {
+ llvm::Error err = body.takeError();
+ std::string msg = "Failed to parse custom event body: " +
+ llvm::toString(std::move(err));
+ result.SetError(msg.c_str());
+ return false;
+ }
+
+ event.try_emplace("body", std::move(*body));
+ }
+
+ g_dap.SendJSON(llvm::json::Value(std::move(event)));
+ result.SetStatus(lldb::eReturnStatusSuccessFinishNoResult);
+ return true;
+}
+
void DAP::SetFrameFormat(llvm::StringRef format) {
if (format.empty())
return;
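The new command is registered on lldb-dap's embedded command interpreter, so besides `stopCommands` hooks it can also be driven through the SB API. A minimal sketch, assuming it runs inside an active lldb-dap session (the handler writes to the DAP connection, so it does nothing useful elsewhere); everything outside the command string is illustrative:

```cpp
#include "lldb/API/SBCommandInterpreter.h"
#include "lldb/API/SBCommandReturnObject.h"
#include "lldb/API/SBDebugger.h"

// Hypothetical driver: issue the send-event command programmatically,
// equivalent to typing it at the DAP REPL.
static bool sendCustomEvent(lldb::SBDebugger &debugger) {
  lldb::SBCommandReturnObject result;
  debugger.GetCommandInterpreter().HandleCommand(
      "lldb-dap send-event my-custom-event '{\"key\": 321}'", result);
  return result.Succeeded();
}
```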
diff --git a/lldb/tools/lldb-dap/DAP.h b/lldb/tools/lldb-dap/DAP.h
index ba6d3d8..acc10ad 100644
--- a/lldb/tools/lldb-dap/DAP.h
+++ b/lldb/tools/lldb-dap/DAP.h
@@ -144,6 +144,11 @@ struct ReplModeRequestHandler : public lldb::SBCommandPluginInterface {
lldb::SBCommandReturnObject &result) override;
};
+struct SendEventRequestHandler : public lldb::SBCommandPluginInterface {
+ bool DoExecute(lldb::SBDebugger debugger, char **command,
+ lldb::SBCommandReturnObject &result) override;
+};
+
struct DAP {
std::string debug_adaptor_path;
InputStream input;
diff --git a/lldb/tools/lldb-dap/README.md b/lldb/tools/lldb-dap/README.md
index 11086eb..42b5f50 100644
--- a/lldb/tools/lldb-dap/README.md
+++ b/lldb/tools/lldb-dap/README.md
@@ -290,6 +290,37 @@ The initial repl-mode can be configured with the cli flag `--repl-mode=<mode>`
and may also be adjusted at runtime using the lldb command
`lldb-dap repl-mode <mode>`.
+#### `lldb-dap send-event`
+
+lldb-dap includes a command to trigger a Debug Adapter Protocol event
+from a script.
+
+The event may be a custom DAP event or a standard event, as long as the event
+is not handled internally by `lldb-dap`.
+
+This command has the format:
+
+```
+lldb-dap send-event <name> <body>?
+```
+
+For example, you can use a launch configuration hook to trigger custom events:
+
+```json
+{
+ "program": "exe",
+ "stopCommands": [
+ "lldb-dap send-event MyStopEvent",
+ "lldb-dap send-event MyStopEvent '{\"key\": 321}",
+ ]
+}
+```
+
+[See the specification](https://microsoft.github.io/debug-adapter-protocol/specification#Base_Protocol_Event)
+for more details on Debug Adapter Protocol events and the VS Code
+[debug.onDidReceiveDebugSessionCustomEvent](https://code.visualstudio.com/api/references/vscode-api#debug.onDidReceiveDebugSessionCustomEvent)
+API for handling a custom event from an extension.
+
## Contributing
`lldb-dap` and `lldb` are developed under the umbrella of the [LLVM project](https://llvm.org/).
diff --git a/lldb/tools/lldb-dap/lldb-dap.cpp b/lldb/tools/lldb-dap/lldb-dap.cpp
index 5e351ab..f70b0d3 100644
--- a/lldb/tools/lldb-dap/lldb-dap.cpp
+++ b/lldb/tools/lldb-dap/lldb-dap.cpp
@@ -1896,6 +1896,8 @@ void request_initialize(const llvm::json::Object &request) {
cmd.AddCommand(
"repl-mode", new ReplModeRequestHandler(),
"Get or set the repl behavior of lldb-dap evaluation requests.");
+ cmd.AddCommand("send-event", new SendEventRequestHandler(),
+ "Sends an DAP event to the client.");
g_dap.progress_event_thread = std::thread(ProgressEventThreadFunction);
diff --git a/lldb/unittests/Utility/DiagnosticsRenderingTest.cpp b/lldb/unittests/Utility/DiagnosticsRenderingTest.cpp
index 2bd8079..39d8b1d 100644
--- a/lldb/unittests/Utility/DiagnosticsRenderingTest.cpp
+++ b/lldb/unittests/Utility/DiagnosticsRenderingTest.cpp
@@ -16,12 +16,63 @@ std::string Render(std::vector<DiagnosticDetail> details) {
} // namespace
TEST_F(ErrorDisplayTest, RenderStatus) {
- DiagnosticDetail::SourceLocation inline_loc;
- inline_loc.in_user_input = true;
+ using SourceLocation = DiagnosticDetail::SourceLocation;
{
+ SourceLocation inline_loc;
+ inline_loc.in_user_input = true;
std::string result =
Render({DiagnosticDetail{inline_loc, eSeverityError, "foo", ""}});
ASSERT_TRUE(StringRef(result).contains("error:"));
ASSERT_TRUE(StringRef(result).contains("foo"));
}
+
+ {
+ // Test that diagnostics on the same column can be handled and all
+ // three errors are diagnosed.
+ SourceLocation loc1 = {FileSpec{"a.c"}, 13, 11, 0, false, true};
+ SourceLocation loc2 = {FileSpec{"a.c"}, 13, 13, 0, false, true};
+ std::string result =
+ Render({DiagnosticDetail{loc1, eSeverityError, "1", "1"},
+ DiagnosticDetail{loc1, eSeverityError, "2", "2"},
+ DiagnosticDetail{loc2, eSeverityError, "3", "3"}});
+ ASSERT_TRUE(StringRef(result).contains("error: 1"));
+ ASSERT_TRUE(StringRef(result).contains("error: 2"));
+ ASSERT_TRUE(StringRef(result).contains("error: 3"));
+ }
+ {
+ // Test that diagnostics in reverse order are emitted correctly.
+ SourceLocation loc1 = {FileSpec{"a.c"}, 1, 20, 0, false, true};
+ SourceLocation loc2 = {FileSpec{"a.c"}, 2, 10, 0, false, true};
+ std::string result =
+ Render({DiagnosticDetail{loc2, eSeverityError, "X", "X"},
+ DiagnosticDetail{loc1, eSeverityError, "Y", "Y"}});
+ ASSERT_LT(StringRef(result).find("Y"), StringRef(result).find("X"));
+ }
+ {
+ // Test that diagnostics in reverse order are emitted correctly.
+ SourceLocation loc1 = {FileSpec{"a.c"}, 2, 10, 0, false, true};
+ SourceLocation loc2 = {FileSpec{"a.c"}, 1, 20, 0, false, true};
+ std::string result =
+ Render({DiagnosticDetail{loc2, eSeverityError, "X", "X"},
+ DiagnosticDetail{loc1, eSeverityError, "Y", "Y"}});
+ ASSERT_LT(StringRef(result).find("Y"), StringRef(result).find("X"));
+ }
+ {
+ // Test that range diagnostics are emitted correctly.
+ SourceLocation loc1 = {FileSpec{"a.c"}, 1, 1, 3, false, true};
+ SourceLocation loc2 = {FileSpec{"a.c"}, 1, 5, 3, false, true};
+ std::string result =
+ Render({DiagnosticDetail{loc1, eSeverityError, "X", "X"},
+ DiagnosticDetail{loc2, eSeverityError, "Y", "Y"}});
+ auto lines = StringRef(result).split('\n');
+ auto line1 = lines.first;
+ lines = lines.second.split('\n');
+ auto line2 = lines.first;
+ lines = lines.second.split('\n');
+ auto line3 = lines.first;
+ // 1234567
+ ASSERT_EQ(line1, "^~~ ^~~");
+ ASSERT_EQ(line2, "| error: Y");
+ ASSERT_EQ(line3, "error: X");
+ }
}
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 57d1fa3..db3b5cd 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1998,6 +1998,9 @@ public:
case Intrinsic::atan:
ISD = ISD::FATAN;
break;
+ case Intrinsic::atan2:
+ ISD = ISD::FATAN2;
+ break;
case Intrinsic::sinh:
ISD = ISD::FSINH;
break;
diff --git a/llvm/include/llvm/CodeGen/EarlyIfConversion.h b/llvm/include/llvm/CodeGen/EarlyIfConversion.h
new file mode 100644
index 0000000..78bf12a
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/EarlyIfConversion.h
@@ -0,0 +1,24 @@
+//===- llvm/CodeGen/EarlyIfConversion.h -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_EARLYIFCONVERSION_H
+#define LLVM_CODEGEN_EARLYIFCONVERSION_H
+
+#include "llvm/CodeGen/MachinePassManager.h"
+
+namespace llvm {
+
+class EarlyIfConverterPass : public PassInfoMixin<EarlyIfConverterPass> {
+public:
+ PreservedAnalyses run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM);
+};
+
+} // namespace llvm
+
+#endif // LLVM_CODEGEN_EARLYIFCONVERSION_H
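The header above only declares the new-PM pass; a rough sketch of invoking it directly, assuming a populated `MachineFunctionAnalysisManager` (the helper itself is hypothetical):

```cpp
#include "llvm/CodeGen/EarlyIfConversion.h"

using namespace llvm;

// Run early if-conversion on a single machine function under the new PM.
static PreservedAnalyses runEarlyIfCvt(MachineFunction &MF,
                                       MachineFunctionAnalysisManager &MFAM) {
  return EarlyIfConverterPass().run(MF, MFAM);
}
```

In a pipeline, the same pass is reachable by name through the `early-ifcvt` entry added to MachinePassRegistry.def further down.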
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
index 82e713f..bcd44ab 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
@@ -599,11 +599,22 @@ public:
LegalizeRuleSet &legalFor(std::initializer_list<LLT> Types) {
return actionFor(LegalizeAction::Legal, Types);
}
+ LegalizeRuleSet &legalFor(bool Pred, std::initializer_list<LLT> Types) {
+ if (!Pred)
+ return *this;
+ return actionFor(LegalizeAction::Legal, Types);
+ }
/// The instruction is legal when type indexes 0 and 1 is any type pair in the
/// given list.
LegalizeRuleSet &legalFor(std::initializer_list<std::pair<LLT, LLT>> Types) {
return actionFor(LegalizeAction::Legal, Types);
}
+ LegalizeRuleSet &legalFor(bool Pred,
+ std::initializer_list<std::pair<LLT, LLT>> Types) {
+ if (!Pred)
+ return *this;
+ return actionFor(LegalizeAction::Legal, Types);
+ }
/// The instruction is legal when type index 0 is any type in the given list
/// and imm index 0 is anything.
LegalizeRuleSet &legalForTypeWithAnyImm(std::initializer_list<LLT> Types) {
@@ -749,6 +760,12 @@ public:
return actionFor(LegalizeAction::Libcall, Types);
}
LegalizeRuleSet &
+ libcallFor(bool Pred, std::initializer_list<std::pair<LLT, LLT>> Types) {
+ if (!Pred)
+ return *this;
+ return actionFor(LegalizeAction::Libcall, Types);
+ }
+ LegalizeRuleSet &
libcallForCartesianProduct(std::initializer_list<LLT> Types) {
return actionForCartesianProduct(LegalizeAction::Libcall, Types);
}
@@ -846,12 +863,23 @@ public:
LegalizeRuleSet &customFor(std::initializer_list<LLT> Types) {
return actionFor(LegalizeAction::Custom, Types);
}
+ LegalizeRuleSet &customFor(bool Pred, std::initializer_list<LLT> Types) {
+ if (!Pred)
+ return *this;
+ return actionFor(LegalizeAction::Custom, Types);
+ }
- /// The instruction is custom when type indexes 0 and 1 is any type pair in the
- /// given list.
+ /// The instruction is custom when type indexes 0 and 1 is any type pair in
+ /// the given list.
LegalizeRuleSet &customFor(std::initializer_list<std::pair<LLT, LLT>> Types) {
return actionFor(LegalizeAction::Custom, Types);
}
+ LegalizeRuleSet &customFor(bool Pred,
+ std::initializer_list<std::pair<LLT, LLT>> Types) {
+ if (!Pred)
+ return *this;
+ return actionFor(LegalizeAction::Custom, Types);
+ }
LegalizeRuleSet &customForCartesianProduct(std::initializer_list<LLT> Types) {
return actionForCartesianProduct(LegalizeAction::Custom, Types);
@@ -990,6 +1018,11 @@ public:
scalarNarrowerThan(TypeIdx, Ty.getSizeInBits()),
changeTo(typeIdx(TypeIdx), Ty));
}
+ LegalizeRuleSet &minScalar(bool Pred, unsigned TypeIdx, const LLT Ty) {
+ if (!Pred)
+ return *this;
+ return minScalar(TypeIdx, Ty);
+ }
/// Ensure the scalar is at least as wide as Ty if condition is met.
LegalizeRuleSet &minScalarIf(LegalityPredicate Predicate, unsigned TypeIdx,
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index da43f5b..0b6d155 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -425,6 +425,7 @@ enum NodeType {
STRICT_FASIN,
STRICT_FACOS,
STRICT_FATAN,
+ STRICT_FATAN2,
STRICT_FSINH,
STRICT_FCOSH,
STRICT_FTANH,
@@ -994,6 +995,8 @@ enum NodeType {
FPOWI,
/// FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
FLDEXP,
+ /// FATAN2 - atan2, inspired by libm.
+ FATAN2,
/// FFREXP - frexp, extract fractional and exponent component of a
/// floating-point value. Returns the two components as separate return
diff --git a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h
index 4d93213..0f2898d 100644
--- a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h
+++ b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h
@@ -45,7 +45,7 @@ struct VRegInfo {
} D;
Register VReg;
Register PreferredReg;
- std::vector<uint8_t> Flags;
+ uint8_t Flags = 0;
};
using Name2RegClassMap = StringMap<const TargetRegisterClass *>;
diff --git a/llvm/include/llvm/CodeGen/MachineTraceMetrics.h b/llvm/include/llvm/CodeGen/MachineTraceMetrics.h
index c7d9759..d51de24 100644
--- a/llvm/include/llvm/CodeGen/MachineTraceMetrics.h
+++ b/llvm/include/llvm/CodeGen/MachineTraceMetrics.h
@@ -46,12 +46,13 @@
#ifndef LLVM_CODEGEN_MACHINETRACEMETRICS_H
#define LLVM_CODEGEN_MACHINETRACEMETRICS_H
-#include "llvm/ADT/SparseSet.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SparseSet.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachinePassManager.h"
#include "llvm/CodeGen/TargetSchedule.h"
namespace llvm {
@@ -93,7 +94,7 @@ enum class MachineTraceStrategy {
TS_NumStrategies
};
-class MachineTraceMetrics : public MachineFunctionPass {
+class MachineTraceMetrics {
const MachineFunction *MF = nullptr;
const TargetInstrInfo *TII = nullptr;
const TargetRegisterInfo *TRI = nullptr;
@@ -102,19 +103,25 @@ class MachineTraceMetrics : public MachineFunctionPass {
TargetSchedModel SchedModel;
public:
+ friend class MachineTraceMetricsWrapperPass;
friend class Ensemble;
friend class Trace;
class Ensemble;
- static char ID;
+ // For legacy pass.
+ MachineTraceMetrics() = default;
+
+ explicit MachineTraceMetrics(MachineFunction &MF, const MachineLoopInfo &LI) {
+ init(MF, LI);
+ }
- MachineTraceMetrics();
+ MachineTraceMetrics(MachineTraceMetrics &&) = default;
- void getAnalysisUsage(AnalysisUsage&) const override;
- bool runOnMachineFunction(MachineFunction&) override;
- void releaseMemory() override;
- void verifyAnalysis() const override;
+ ~MachineTraceMetrics();
+
+ void init(MachineFunction &Func, const MachineLoopInfo &LI);
+ void clear();
/// Per-basic block information that doesn't depend on the trace through the
/// block.
@@ -400,6 +407,12 @@ public:
/// Call Ensemble::getTrace() again to update any trace handles.
void invalidate(const MachineBasicBlock *MBB);
+ /// Handle invalidation explicitly.
+ bool invalidate(MachineFunction &, const PreservedAnalyses &PA,
+ MachineFunctionAnalysisManager::Invalidator &);
+
+ void verifyAnalysis() const;
+
private:
// One entry per basic block, indexed by block number.
SmallVector<FixedBlockInfo, 4> BlockInfo;
@@ -412,8 +425,8 @@ private:
SmallVector<unsigned, 0> ProcReleaseAtCycles;
// One ensemble per strategy.
- Ensemble
- *Ensembles[static_cast<size_t>(MachineTraceStrategy::TS_NumStrategies)];
+ std::unique_ptr<Ensemble>
+ Ensembles[static_cast<size_t>(MachineTraceStrategy::TS_NumStrategies)];
// Convert scaled resource usage to a cycle count that can be compared with
// latencies.
@@ -435,6 +448,38 @@ inline raw_ostream &operator<<(raw_ostream &OS,
return OS;
}
+class MachineTraceMetricsAnalysis
+ : public AnalysisInfoMixin<MachineTraceMetricsAnalysis> {
+ friend AnalysisInfoMixin<MachineTraceMetricsAnalysis>;
+ static AnalysisKey Key;
+
+public:
+ using Result = MachineTraceMetrics;
+ Result run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM);
+};
+
+/// Verifier pass for \c MachineTraceMetrics.
+struct MachineTraceMetricsVerifierPass
+ : PassInfoMixin<MachineTraceMetricsVerifierPass> {
+ PreservedAnalyses run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM);
+ static bool isRequired() { return true; }
+};
+
+class MachineTraceMetricsWrapperPass : public MachineFunctionPass {
+public:
+ static char ID;
+ MachineTraceMetrics MTM;
+
+ MachineTraceMetricsWrapperPass();
+
+ void getAnalysisUsage(AnalysisUsage &) const override;
+ bool runOnMachineFunction(MachineFunction &) override;
+ void releaseMemory() override { MTM.clear(); }
+ void verifyAnalysis() const override { MTM.verifyAnalysis(); }
+ MachineTraceMetrics &getMTM() { return MTM; }
+};
+
} // end namespace llvm
#endif // LLVM_CODEGEN_MACHINETRACEMETRICS_H
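With MachineTraceMetrics split out of the legacy pass, new-PM machine passes obtain it from the analysis manager. A rough sketch under that assumption (the consuming pass is hypothetical):

```cpp
#include "llvm/CodeGen/MachineTraceMetrics.h"

using namespace llvm;

struct TraceQueryPass : PassInfoMixin<TraceQueryPass> {
  PreservedAnalyses run(MachineFunction &MF,
                        MachineFunctionAnalysisManager &MFAM) {
    MachineTraceMetrics &MTM =
        MFAM.getResult<MachineTraceMetricsAnalysis>(MF);
    // Ensembles are created lazily and owned by the analysis result.
    MachineTraceMetrics::Ensemble *MinInstr =
        MTM.getEnsemble(MachineTraceStrategy::TS_MinInstrCount);
    (void)MinInstr; // trace queries would go here
    return PreservedAnalyses::all();
  }
};
```

Legacy-PM clients keep the old entry point via `MachineTraceMetricsWrapperPass::getMTM()`.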
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index 99421bd..bbbf996 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -273,7 +273,7 @@ namespace llvm {
/// EarlyIfConverter - This pass performs if-conversion on SSA form by
/// inserting cmov instructions.
- extern char &EarlyIfConverterID;
+ extern char &EarlyIfConverterLegacyID;
/// EarlyIfPredicator - This pass performs if-conversion on SSA form by
/// predicating if/else block and insert select at the join point.
diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h
index 0af4f73b..b3e249b 100644
--- a/llvm/include/llvm/CodeGen/SDPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h
@@ -760,6 +760,16 @@ inline BinaryOpc_match<LHS, RHS> m_Srl(const LHS &L, const RHS &R) {
}
template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS> m_Rotl(const LHS &L, const RHS &R) {
+ return BinaryOpc_match<LHS, RHS>(ISD::ROTL, L, R);
+}
+
+template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS> m_Rotr(const LHS &L, const RHS &R) {
+ return BinaryOpc_match<LHS, RHS>(ISD::ROTR, L, R);
+}
+
+template <typename LHS, typename RHS>
inline BinaryOpc_match<LHS, RHS, true> m_FAdd(const LHS &L, const RHS &R) {
return BinaryOpc_match<LHS, RHS, true>(ISD::FADD, L, R);
}
@@ -823,6 +833,11 @@ inline UnaryOpc_match<Opnd, true> m_ChainedUnaryOp(unsigned Opc,
}
template <typename Opnd>
+inline UnaryOpc_match<Opnd> m_BSwap(const Opnd &Op) {
+ return UnaryOpc_match<Opnd>(ISD::BSWAP, Op);
+}
+
+template <typename Opnd>
inline UnaryOpc_match<Opnd> m_BitReverse(const Opnd &Op) {
return UnaryOpc_match<Opnd>(ISD::BITREVERSE, Op);
}
@@ -892,10 +907,18 @@ template <typename Opnd> inline UnaryOpc_match<Opnd> m_FPToSI(const Opnd &Op) {
return UnaryOpc_match<Opnd>(ISD::FP_TO_SINT, Op);
}
+template <typename Opnd> inline UnaryOpc_match<Opnd> m_Ctpop(const Opnd &Op) {
+ return UnaryOpc_match<Opnd>(ISD::CTPOP, Op);
+}
+
template <typename Opnd> inline UnaryOpc_match<Opnd> m_Ctlz(const Opnd &Op) {
return UnaryOpc_match<Opnd>(ISD::CTLZ, Op);
}
+template <typename Opnd> inline UnaryOpc_match<Opnd> m_Cttz(const Opnd &Op) {
+ return UnaryOpc_match<Opnd>(ISD::CTTZ, Op);
+}
+
// === Constants ===
struct ConstantInt_match {
APInt *BindVal;
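The added matchers compose with the existing `sd_match` machinery. A small sketch; the recognizer itself is hypothetical, only the matchers come from this patch:

```cpp
#include "llvm/CodeGen/SDPatternMatch.h"

using namespace llvm;
using namespace llvm::SDPatternMatch;

// Recognize ctpop(bswap(x)): byte-swapping only permutes bytes, so the
// population count equals ctpop(x). X binds to the inner operand.
static bool matchCtpopOfBSwap(SDValue N, SDValue &X) {
  return sd_match(N, m_Ctpop(m_BSwap(m_Value(X))));
}
```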
diff --git a/llvm/include/llvm/IR/ConstrainedOps.def b/llvm/include/llvm/IR/ConstrainedOps.def
index 56304c3..30a82bf 100644
--- a/llvm/include/llvm/IR/ConstrainedOps.def
+++ b/llvm/include/llvm/IR/ConstrainedOps.def
@@ -72,6 +72,7 @@ CMP_INSTRUCTION(FCmp, 2, 0, experimental_constrained_fcmps, FSETCCS
DAG_FUNCTION(acos, 1, 1, experimental_constrained_acos, FACOS)
DAG_FUNCTION(asin, 1, 1, experimental_constrained_asin, FASIN)
DAG_FUNCTION(atan, 1, 1, experimental_constrained_atan, FATAN)
+DAG_FUNCTION(atan2, 2, 1, experimental_constrained_atan2, FATAN2)
DAG_FUNCTION(ceil, 1, 0, experimental_constrained_ceil, FCEIL)
DAG_FUNCTION(cos, 1, 1, experimental_constrained_cos, FCOS)
DAG_FUNCTION(cosh, 1, 1, experimental_constrained_cosh, FCOSH)
diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h
index 49f4fe4..e893295 100644
--- a/llvm/include/llvm/IR/Intrinsics.h
+++ b/llvm/include/llvm/IR/Intrinsics.h
@@ -102,6 +102,16 @@ namespace Intrinsic {
inline Function *getDeclaration(Module *M, ID id, ArrayRef<Type *> Tys = {}) {
return getOrInsertDeclaration(M, id, Tys);
}
+
+ /// Look up the Function declaration of the intrinsic \p id in the Module
+ /// \p M and return it if it exists. Otherwise, return nullptr. This version
+ /// supports non-overloaded intrinsics.
+ Function *getDeclarationIfExists(const Module *M, ID id);
+
+ /// This version supports overloaded intrinsics.
+ Function *getDeclarationIfExists(Module *M, ID id, ArrayRef<Type *> Tys,
+ FunctionType *FT = nullptr);
+
/// Looks up Name in NameTable via binary search. NameTable must be sorted
/// and all entries must start with "llvm.". If NameTable contains an exact
/// match for Name or a prefix of Name followed by a dot, its index in
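The lookup-only variant is for transforms that should fire only when an intrinsic is already referenced, without mutating the module the way `getOrInsertDeclaration` does. A minimal sketch, assuming a scan over an existing `Module`:

```cpp
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Hypothetical query: does this module already declare llvm.vastart?
static bool moduleDeclaresVAStart(const Module &M) {
  return Intrinsic::getDeclarationIfExists(&M, Intrinsic::vastart) != nullptr;
}
```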
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 8a0721c..94e53f3 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1235,6 +1235,11 @@ let IntrProperties = [IntrInaccessibleMemOnly, IntrWillReturn, IntrStrictFP] in
[ LLVMMatchType<0>,
llvm_metadata_ty,
llvm_metadata_ty ]>;
+ def int_experimental_constrained_atan2 : DefaultAttrsIntrinsic<[ llvm_anyfloat_ty ],
+ [ LLVMMatchType<0>,
+ LLVMMatchType<0>,
+ llvm_metadata_ty,
+ llvm_metadata_ty ]>;
def int_experimental_constrained_sin : DefaultAttrsIntrinsic<[ llvm_anyfloat_ty ],
[ LLVMMatchType<0>,
llvm_metadata_ty,
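On the IR side, the new definition is a two-argument constrained call, emitted like the other constrained math intrinsics. A sketch assuming an initialized `IRBuilder<>` and two floats of matching type (the helper is hypothetical):

```cpp
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"

using namespace llvm;

// Emit llvm.experimental.constrained.atan2 with the builder's default
// rounding and exception metadata.
static Value *emitConstrainedAtan2(IRBuilder<> &B, Value *Y, Value *X) {
  Function *Atan2 = Intrinsic::getOrInsertDeclaration(
      B.GetInsertBlock()->getModule(),
      Intrinsic::experimental_constrained_atan2, {Y->getType()});
  return B.CreateConstrainedFPCall(Atan2, {Y, X});
}
```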
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.def b/llvm/include/llvm/IR/RuntimeLibcalls.def
index 69cf431..4aab658 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.def
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.def
@@ -232,6 +232,11 @@ HANDLE_LIBCALL(ATAN_F64, "atan")
HANDLE_LIBCALL(ATAN_F80, "atanl")
HANDLE_LIBCALL(ATAN_F128,"atanl")
HANDLE_LIBCALL(ATAN_PPCF128, "atanl")
+HANDLE_LIBCALL(ATAN2_F32, "atan2f")
+HANDLE_LIBCALL(ATAN2_F64, "atan2")
+HANDLE_LIBCALL(ATAN2_F80, "atan2l")
+HANDLE_LIBCALL(ATAN2_F128,"atan2l")
+HANDLE_LIBCALL(ATAN2_PPCF128, "atan2l")
HANDLE_LIBCALL(SINCOS_F32, nullptr)
HANDLE_LIBCALL(SINCOS_F64, nullptr)
HANDLE_LIBCALL(SINCOS_F80, nullptr)
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 6a75dc0..1374880 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -98,7 +98,7 @@ void initializeDominatorTreeWrapperPassPass(PassRegistry &);
void initializeDwarfEHPrepareLegacyPassPass(PassRegistry &);
void initializeEarlyCSELegacyPassPass(PassRegistry &);
void initializeEarlyCSEMemSSALegacyPassPass(PassRegistry &);
-void initializeEarlyIfConverterPass(PassRegistry &);
+void initializeEarlyIfConverterLegacyPass(PassRegistry &);
void initializeEarlyIfPredicatorPass(PassRegistry &);
void initializeEarlyMachineLICMPass(PassRegistry &);
void initializeEarlyTailDuplicatePass(PassRegistry &);
@@ -209,7 +209,7 @@ void initializeMachineRegionInfoPassPass(PassRegistry &);
void initializeMachineSanitizerBinaryMetadataPass(PassRegistry &);
void initializeMachineSchedulerPass(PassRegistry &);
void initializeMachineSinkingPass(PassRegistry &);
-void initializeMachineTraceMetricsPass(PassRegistry &);
+void initializeMachineTraceMetricsWrapperPassPass(PassRegistry &);
void initializeMachineUniformityInfoPrinterPassPass(PassRegistry &);
void initializeMachineUniformityAnalysisPassPass(PassRegistry &);
void initializeMachineVerifierLegacyPassPass(PassRegistry &);
diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
index 0d45df0..9ef6e39 100644
--- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h
+++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
@@ -27,6 +27,7 @@
#include "llvm/CodeGen/CodeGenPrepare.h"
#include "llvm/CodeGen/DeadMachineInstructionElim.h"
#include "llvm/CodeGen/DwarfEHPrepare.h"
+#include "llvm/CodeGen/EarlyIfConversion.h"
#include "llvm/CodeGen/ExpandLargeDivRem.h"
#include "llvm/CodeGen/ExpandLargeFpConvert.h"
#include "llvm/CodeGen/ExpandMemCmp.h"
diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def
index 2aa5f4f..4e44d03 100644
--- a/llvm/include/llvm/Passes/MachinePassRegistry.def
+++ b/llvm/include/llvm/Passes/MachinePassRegistry.def
@@ -106,6 +106,7 @@ MACHINE_FUNCTION_ANALYSIS("machine-opt-remark-emitter",
MachineOptimizationRemarkEmitterAnalysis())
MACHINE_FUNCTION_ANALYSIS("machine-post-dom-tree",
MachinePostDominatorTreeAnalysis())
+MACHINE_FUNCTION_ANALYSIS("machine-trace-metrics", MachineTraceMetricsAnalysis())
MACHINE_FUNCTION_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis(PIC))
MACHINE_FUNCTION_ANALYSIS("slot-indexes", SlotIndexesAnalysis())
// MACHINE_FUNCTION_ANALYSIS("live-stacks", LiveStacksPass())
@@ -119,8 +120,6 @@ MACHINE_FUNCTION_ANALYSIS("slot-indexes", SlotIndexesAnalysis())
// MachinePostDominatorTreeAnalysis())
// MACHINE_FUNCTION_ANALYSIS("machine-region-info",
// MachineRegionInfoPassAnalysis())
-// MACHINE_FUNCTION_ANALYSIS("machine-trace-metrics",
-// MachineTraceMetricsAnalysis()) MACHINE_FUNCTION_ANALYSIS("reaching-def",
// ReachingDefAnalysisAnalysis()) MACHINE_FUNCTION_ANALYSIS("live-reg-matrix",
// LiveRegMatrixAnalysis()) MACHINE_FUNCTION_ANALYSIS("gc-analysis",
// GCMachineCodeAnalysisPass())
@@ -130,6 +129,7 @@ MACHINE_FUNCTION_ANALYSIS("slot-indexes", SlotIndexesAnalysis())
#define MACHINE_FUNCTION_PASS(NAME, CREATE_PASS)
#endif
MACHINE_FUNCTION_PASS("dead-mi-elimination", DeadMachineInstructionElimPass())
+MACHINE_FUNCTION_PASS("early-ifcvt", EarlyIfConverterPass())
MACHINE_FUNCTION_PASS("early-machinelicm", EarlyMachineLICMPass())
MACHINE_FUNCTION_PASS("finalize-isel", FinalizeISelPass())
MACHINE_FUNCTION_PASS("localstackalloc", LocalStackSlotAllocationPass())
@@ -156,6 +156,7 @@ MACHINE_FUNCTION_PASS("stack-coloring", StackColoringPass())
MACHINE_FUNCTION_PASS("trigger-verifier-error", TriggerVerifierErrorPass())
MACHINE_FUNCTION_PASS("two-address-instruction", TwoAddressInstructionPass())
MACHINE_FUNCTION_PASS("verify", MachineVerifierPass())
+MACHINE_FUNCTION_PASS("verify<machine-trace-metrics>", MachineTraceMetricsVerifierPass())
#undef MACHINE_FUNCTION_PASS
#ifndef MACHINE_FUNCTION_PASS_WITH_PARAMS
@@ -205,7 +206,6 @@ DUMMY_MACHINE_FUNCTION_PASS("cfi-fixup", CFIFixupPass)
DUMMY_MACHINE_FUNCTION_PASS("cfi-instr-inserter", CFIInstrInserterPass)
DUMMY_MACHINE_FUNCTION_PASS("detect-dead-lanes", DetectDeadLanesPass)
DUMMY_MACHINE_FUNCTION_PASS("dot-machine-cfg", MachineCFGPrinter)
-DUMMY_MACHINE_FUNCTION_PASS("early-ifcvt", EarlyIfConverterPass)
DUMMY_MACHINE_FUNCTION_PASS("early-tailduplication", EarlyTailDuplicatePass)
DUMMY_MACHINE_FUNCTION_PASS("fentry-insert", FEntryInserterPass)
DUMMY_MACHINE_FUNCTION_PASS("fixup-statepoint-caller-saved", FixupStatepointCallerSavedPass)
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index adf8a75..fa516fc 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -534,6 +534,7 @@ def ftan : SDNode<"ISD::FTAN" , SDTFPUnaryOp>;
def fasin : SDNode<"ISD::FASIN" , SDTFPUnaryOp>;
def facos : SDNode<"ISD::FACOS" , SDTFPUnaryOp>;
def fatan : SDNode<"ISD::FATAN" , SDTFPUnaryOp>;
+def fatan2 : SDNode<"ISD::FATAN2" , SDTFPBinOp>;
def fsinh : SDNode<"ISD::FSINH" , SDTFPUnaryOp>;
def fcosh : SDNode<"ISD::FCOSH" , SDTFPUnaryOp>;
def ftanh : SDNode<"ISD::FTANH" , SDTFPUnaryOp>;
@@ -602,6 +603,8 @@ def strict_facos : SDNode<"ISD::STRICT_FACOS",
SDTFPUnaryOp, [SDNPHasChain]>;
def strict_fatan : SDNode<"ISD::STRICT_FATAN",
SDTFPUnaryOp, [SDNPHasChain]>;
+def strict_fatan2 : SDNode<"ISD::STRICT_FATAN2",
+ SDTFPBinOp, [SDNPHasChain]>;
def strict_fsinh : SDNode<"ISD::STRICT_FSINH",
SDTFPUnaryOp, [SDNPHasChain]>;
def strict_fcosh : SDNode<"ISD::STRICT_FCOSH",
@@ -1588,6 +1591,9 @@ def any_facos : PatFrags<(ops node:$src),
def any_fatan : PatFrags<(ops node:$src),
[(strict_fatan node:$src),
(fatan node:$src)]>;
+def any_fatan2 : PatFrags<(ops node:$src1, node:$src2),
+ [(strict_fatan2 node:$src1, node:$src2),
+ (fatan2 node:$src1, node:$src2)]>;
def any_fsinh : PatFrags<(ops node:$src),
[(strict_fsinh node:$src),
(fsinh node:$src)]>;
diff --git a/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h b/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h
index 076d91a..4e757b2 100644
--- a/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h
+++ b/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h
@@ -201,9 +201,7 @@ private:
void UpdateWithSalvagedProfiles();
LocToLocMap &getIRToProfileLocationMap(const Function &F) {
- auto Ret = FuncMappings.try_emplace(
- FunctionSamples::getCanonicalFnName(F.getName()), LocToLocMap());
- return Ret.first->second;
+ return FuncMappings[FunctionSamples::getCanonicalFnName(F.getName())];
}
void distributeIRToProfileLocationMap();
void distributeIRToProfileLocationMap(FunctionSamples &FS);
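
The rewrite above works because operator[] on a map value-initializes a
missing entry, so it is equivalent to the try_emplace form it replaces,
only shorter. A minimal standalone sketch (std::map stands in for the
LocToLocMap type used here):

    #include <cassert>
    #include <map>
    #include <string>

    int main() {
      std::map<std::string, std::map<unsigned, unsigned>> FuncMappings;
      // First access inserts an empty mapped value, exactly like
      // try_emplace(Key, LocToLocMap()) followed by returning the slot.
      auto &M = FuncMappings["foo"];
      assert(M.empty());
      auto Ret = FuncMappings.try_emplace("foo"); // no-op: key exists
      assert(!Ret.second && &Ret.first->second == &M);
    }
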
diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h
index 6bad38b..a451286 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h
@@ -279,6 +279,7 @@ public:
unsigned size() const { return Bundles.size(); }
#ifndef NDEBUG
+ void print(raw_ostream &OS) const;
LLVM_DUMP_METHOD void dump() const;
#endif // NDEBUG
};
diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp
index 30dc4ae..10ad470 100644
--- a/llvm/lib/Analysis/LazyValueInfo.cpp
+++ b/llvm/lib/Analysis/LazyValueInfo.cpp
@@ -1613,7 +1613,7 @@ LazyValueInfoImpl &LazyValueInfo::getOrCreateImpl(const Module *M) {
assert(M && "getCache() called with a null Module");
const DataLayout &DL = M->getDataLayout();
Function *GuardDecl =
- M->getFunction(Intrinsic::getName(Intrinsic::experimental_guard));
+ Intrinsic::getDeclarationIfExists(M, Intrinsic::experimental_guard);
PImpl = new LazyValueInfoImpl(AC, DL, GuardDecl);
}
return *static_cast<LazyValueInfoImpl *>(PImpl);
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 97ea405..a3ba8e0 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -11665,8 +11665,8 @@ bool ScalarEvolution::isBasicBlockEntryGuardedByCond(const BasicBlock *BB,
}
// Check conditions due to any @llvm.experimental.guard intrinsics.
- auto *GuardDecl = F.getParent()->getFunction(
- Intrinsic::getName(Intrinsic::experimental_guard));
+ auto *GuardDecl = Intrinsic::getDeclarationIfExists(
+ F.getParent(), Intrinsic::experimental_guard);
if (GuardDecl)
for (const auto *GU : GuardDecl->users())
if (const auto *Guard = dyn_cast<IntrinsicInst>(GU))
@@ -13615,8 +13615,8 @@ ScalarEvolution::ScalarEvolution(Function &F, TargetLibraryInfo &TLI,
// ScalarEvolution to optimize based on those guards. For now we prefer to be
// efficient in lieu of being smart in that rather obscure case.
- auto *GuardDecl = F.getParent()->getFunction(
- Intrinsic::getName(Intrinsic::experimental_guard));
+ auto *GuardDecl = Intrinsic::getDeclarationIfExists(
+ F.getParent(), Intrinsic::experimental_guard);
HasGuards = GuardDecl && !GuardDecl->use_empty();
}
@@ -15593,8 +15593,8 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) {
}
// Second, collect information from llvm.experimental.guards dominating the loop.
- auto *GuardDecl = SE.F.getParent()->getFunction(
- Intrinsic::getName(Intrinsic::experimental_guard));
+ auto *GuardDecl = Intrinsic::getDeclarationIfExists(
+ SE.F.getParent(), Intrinsic::experimental_guard);
if (GuardDecl)
for (const auto *GU : GuardDecl->users())
if (const auto *Guard = dyn_cast<IntrinsicInst>(GU))
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index eb8d170..e9ed8b3 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -8522,6 +8522,10 @@ bool llvm::isKnownInversion(const Value *X, const Value *Y) {
!match(Y, m_c_ICmp(Pred2, m_Specific(A), m_Value(C))))
return false;
+ // Either both icmps carry the samesign flag or neither does.
+ if (cast<ICmpInst>(X)->hasSameSign() != cast<ICmpInst>(Y)->hasSameSign())
+ return false;
+
if (B == C)
return Pred1 == ICmpInst::getInversePredicate(Pred2);
@@ -8530,6 +8534,11 @@ bool llvm::isKnownInversion(const Value *X, const Value *Y) {
if (!match(B, m_APInt(RHSC1)) || !match(C, m_APInt(RHSC2)))
return false;
+ // The sign bits of the two RHS constants must match.
+ if (cast<ICmpInst>(X)->hasSameSign() &&
+ RHSC1->isNonNegative() != RHSC2->isNonNegative())
+ return false;
+
const auto CR1 = ConstantRange::makeExactICmpRegion(Pred1, *RHSC1);
const auto CR2 = ConstantRange::makeExactICmpRegion(Pred2, *RHSC2);
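
For the constant-RHS case, the inversion test reduces to comparing the
two exact predicate regions; a self-contained restatement of that step
(assuming, as in the function's tail, that inversion means the two
regions partition the whole domain):

    #include "llvm/IR/ConstantRange.h"
    #include "llvm/IR/InstrTypes.h"
    using namespace llvm;

    // Returns true iff (x P1 C1) is the logical negation of (x P2 C2).
    static bool rangesAreInverses(CmpInst::Predicate P1, const APInt &C1,
                                  CmpInst::Predicate P2, const APInt &C2) {
      ConstantRange CR1 = ConstantRange::makeExactICmpRegion(P1, C1);
      ConstantRange CR2 = ConstantRange::makeExactICmpRegion(P2, C2);
      return CR1.inverse() == CR2;
    }

The samesign guards added above run before this step: when the flags
differ, or when samesign is set but the two constants differ in sign,
the helper now conservatively answers false.
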
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index dbffbb8..6b5251e 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -1414,7 +1414,7 @@ void InterleavedAccessInfo::analyzeInterleaving(
auto InvalidateGroupIfMemberMayWrap = [&](InterleaveGroup<Instruction> *Group,
int Index,
- std::string FirstOrLast) -> bool {
+ const char *FirstOrLast) -> bool {
Instruction *Member = Group->getMember(Index);
assert(Member && "Group member does not exist");
Value *MemberPtr = getLoadStorePointerOperand(Member);
@@ -1455,11 +1455,10 @@ void InterleavedAccessInfo::analyzeInterleaving(
// So we check only group member 0 (which is always guaranteed to exist),
// and group member Factor - 1; if the latter doesn't exist we rely on
// peeling (if it is a non-reversed access -- see Case 3).
- if (InvalidateGroupIfMemberMayWrap(Group, 0, std::string("first")))
+ if (InvalidateGroupIfMemberMayWrap(Group, 0, "first"))
continue;
if (Group->getMember(Group->getFactor() - 1))
- InvalidateGroupIfMemberMayWrap(Group, Group->getFactor() - 1,
- std::string("last"));
+ InvalidateGroupIfMemberMayWrap(Group, Group->getFactor() - 1, "last");
else {
// Case 3: A non-reversed interleaved load group with gaps: We need
// to execute at least one scalar epilogue iteration. This will ensure
@@ -1503,11 +1502,11 @@ void InterleavedAccessInfo::analyzeInterleaving(
// and the last group member. Case 3 (scalar epilog) is not relevant for
// stores with gaps, which are implemented with masked-store (rather than
// speculative access, as in loads).
- if (InvalidateGroupIfMemberMayWrap(Group, 0, std::string("first")))
+ if (InvalidateGroupIfMemberMayWrap(Group, 0, "first"))
continue;
for (int Index = Group->getFactor() - 1; Index > 0; Index--)
if (Group->getMember(Index)) {
- InvalidateGroupIfMemberMayWrap(Group, Index, std::string("last"));
+ InvalidateGroupIfMemberMayWrap(Group, Index, "last");
break;
}
}
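
The signature change above is a small allocation fix: the lambda is only
ever called with string literals, so taking const char* avoids building
a heap-backed std::string temporary at each call site. Illustrative
sketch:

    #include <cstdio>

    // A literal-only parameter: const char* (or StringRef) is enough,
    // and no std::string temporary is constructed per call.
    static void report(const char *FirstOrLast) {
      std::printf("group member (%s) may wrap\n", FirstOrLast);
    }

    int main() {
      report("first");
      report("last");
    }
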
diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp
index 48cc21ee..2d7f351 100644
--- a/llvm/lib/CodeGen/CodeGen.cpp
+++ b/llvm/lib/CodeGen/CodeGen.cpp
@@ -35,7 +35,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeDebugifyMachineModulePass(Registry);
initializeDetectDeadLanesPass(Registry);
initializeDwarfEHPrepareLegacyPassPass(Registry);
- initializeEarlyIfConverterPass(Registry);
+ initializeEarlyIfConverterLegacyPass(Registry);
initializeEarlyIfPredicatorPass(Registry);
initializeEarlyMachineLICMPass(Registry);
initializeEarlyTailDuplicatePass(Registry);
diff --git a/llvm/lib/CodeGen/EarlyIfConversion.cpp b/llvm/lib/CodeGen/EarlyIfConversion.cpp
index 8d9813e..3e73995 100644
--- a/llvm/lib/CodeGen/EarlyIfConversion.cpp
+++ b/llvm/lib/CodeGen/EarlyIfConversion.cpp
@@ -15,6 +15,7 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/EarlyIfConversion.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
@@ -760,7 +761,7 @@ void SSAIfConv::convertIf(SmallVectorImpl<MachineBasicBlock *> &RemoveBlocks,
//===----------------------------------------------------------------------===//
namespace {
-class EarlyIfConverter : public MachineFunctionPass {
+class EarlyIfConverter {
const TargetInstrInfo *TII = nullptr;
const TargetRegisterInfo *TRI = nullptr;
MCSchedModel SchedModel;
@@ -772,38 +773,48 @@ class EarlyIfConverter : public MachineFunctionPass {
SSAIfConv IfConv;
public:
- static char ID;
- EarlyIfConverter() : MachineFunctionPass(ID) {}
- void getAnalysisUsage(AnalysisUsage &AU) const override;
- bool runOnMachineFunction(MachineFunction &MF) override;
- StringRef getPassName() const override { return "Early If-Conversion"; }
+ EarlyIfConverter(MachineDominatorTree &DT, MachineLoopInfo &LI,
+ MachineTraceMetrics &MTM)
+ : DomTree(&DT), Loops(&LI), Traces(&MTM) {}
+ EarlyIfConverter() = delete;
+
+ bool run(MachineFunction &MF);
private:
bool tryConvertIf(MachineBasicBlock *);
void invalidateTraces();
bool shouldConvertIf();
};
+
+class EarlyIfConverterLegacy : public MachineFunctionPass {
+public:
+ static char ID;
+ EarlyIfConverterLegacy() : MachineFunctionPass(ID) {}
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ StringRef getPassName() const override { return "Early If-Conversion"; }
+};
} // end anonymous namespace
-char EarlyIfConverter::ID = 0;
-char &llvm::EarlyIfConverterID = EarlyIfConverter::ID;
+char EarlyIfConverterLegacy::ID = 0;
+char &llvm::EarlyIfConverterLegacyID = EarlyIfConverterLegacy::ID;
-INITIALIZE_PASS_BEGIN(EarlyIfConverter, DEBUG_TYPE,
- "Early If Converter", false, false)
+INITIALIZE_PASS_BEGIN(EarlyIfConverterLegacy, DEBUG_TYPE, "Early If Converter",
+ false, false)
INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
-INITIALIZE_PASS_END(EarlyIfConverter, DEBUG_TYPE,
- "Early If Converter", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineTraceMetricsWrapperPass)
+INITIALIZE_PASS_END(EarlyIfConverterLegacy, DEBUG_TYPE, "Early If Converter",
+ false, false)
-void EarlyIfConverter::getAnalysisUsage(AnalysisUsage &AU) const {
+void EarlyIfConverterLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<MachineBranchProbabilityInfoWrapperPass>();
AU.addRequired<MachineDominatorTreeWrapperPass>();
AU.addPreserved<MachineDominatorTreeWrapperPass>();
AU.addRequired<MachineLoopInfoWrapperPass>();
AU.addPreserved<MachineLoopInfoWrapperPass>();
- AU.addRequired<MachineTraceMetrics>();
- AU.addPreserved<MachineTraceMetrics>();
+ AU.addRequired<MachineTraceMetricsWrapperPass>();
+ AU.addPreserved<MachineTraceMetricsWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -1076,11 +1087,9 @@ bool EarlyIfConverter::tryConvertIf(MachineBasicBlock *MBB) {
return Changed;
}
-bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) {
+bool EarlyIfConverter::run(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "********** EARLY IF-CONVERSION **********\n"
<< "********** Function: " << MF.getName() << '\n');
- if (skipFunction(MF.getFunction()))
- return false;
// Only run if-conversion if the target wants it.
const TargetSubtargetInfo &STI = MF.getSubtarget();
@@ -1091,9 +1100,6 @@ bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) {
TRI = STI.getRegisterInfo();
SchedModel = STI.getSchedModel();
MRI = &MF.getRegInfo();
- DomTree = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
- Loops = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
- Traces = &getAnalysis<MachineTraceMetrics>();
MinInstr = nullptr;
bool Changed = false;
@@ -1110,6 +1116,41 @@ bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) {
return Changed;
}
+PreservedAnalyses
+EarlyIfConverterPass::run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM) {
+ if (MF.getFunction().hasOptNone())
+ return PreservedAnalyses::all();
+
+ MachineDominatorTree &MDT = MFAM.getResult<MachineDominatorTreeAnalysis>(MF);
+ MachineLoopInfo &LI = MFAM.getResult<MachineLoopAnalysis>(MF);
+ MachineTraceMetrics &MTM = MFAM.getResult<MachineTraceMetricsAnalysis>(MF);
+
+ EarlyIfConverter Impl(MDT, LI, MTM);
+ bool Changed = Impl.run(MF);
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ auto PA = getMachineFunctionPassPreservedAnalyses();
+ PA.preserve<MachineDominatorTreeAnalysis>();
+ PA.preserve<MachineLoopAnalysis>();
+ PA.preserve<MachineTraceMetricsAnalysis>();
+ return PA;
+}
+
+bool EarlyIfConverterLegacy::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ MachineDominatorTree &MDT =
+ getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+ MachineLoopInfo &LI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
+ MachineTraceMetrics &MTM =
+ getAnalysis<MachineTraceMetricsWrapperPass>().getMTM();
+
+ return EarlyIfConverter(MDT, LI, MTM).run(MF);
+}
+
//===----------------------------------------------------------------------===//
// EarlyIfPredicator Pass
//===----------------------------------------------------------------------===//
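
The restructuring above follows the usual pattern for porting a machine
pass to the new pass manager: the logic moves into a plain class, and
both pass interfaces become thin adapters that gather analyses and call
run(). A schematic sketch (names hypothetical, not from the patch):

    // Core logic, pass-manager agnostic.
    class FooImpl {
      MachineDominatorTree *DomTree;
    public:
      FooImpl(MachineDominatorTree &DT) : DomTree(&DT) {}
      bool run(MachineFunction &MF); // all real work happens here
    };

    // Legacy adapter: fetch analyses via getAnalysis<>.
    bool FooLegacy::runOnMachineFunction(MachineFunction &MF) {
      auto &DT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
      return FooImpl(DT).run(MF);
    }

    // New-PM adapter: fetch analyses via the analysis manager.
    PreservedAnalyses FooPass::run(MachineFunction &MF,
                                   MachineFunctionAnalysisManager &MFAM) {
      auto &DT = MFAM.getResult<MachineDominatorTreeAnalysis>(MF);
      return FooImpl(DT).run(MF) ? getMachineFunctionPassPreservedAnalyses()
                                 : PreservedAnalyses::all();
    }

One wrinkle visible in the diff: the new-PM entry point checks
hasOptNone() itself, since skipFunction() is a legacy-pass facility.
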
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 14e94d4..f9b1621 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -178,7 +178,7 @@ void CombinerHelper::replaceRegWith(MachineRegisterInfo &MRI, Register FromReg,
if (MRI.constrainRegAttrs(ToReg, FromReg))
MRI.replaceRegWith(FromReg, ToReg);
else
- Builder.buildCopy(ToReg, FromReg);
+ Builder.buildCopy(FromReg, ToReg);
Observer.finishedChangingAllUsesOfReg();
}
@@ -229,8 +229,8 @@ bool CombinerHelper::matchCombineCopy(MachineInstr &MI) {
void CombinerHelper::applyCombineCopy(MachineInstr &MI) {
Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
- MI.eraseFromParent();
replaceRegWith(MRI, DstReg, SrcReg);
+ MI.eraseFromParent();
}
bool CombinerHelper::matchFreezeOfSingleMaybePoisonOperand(
@@ -379,8 +379,8 @@ void CombinerHelper::applyCombineConcatVectors(MachineInstr &MI,
Builder.buildUndef(NewDstReg);
else
Builder.buildBuildVector(NewDstReg, Ops);
- MI.eraseFromParent();
replaceRegWith(MRI, DstReg, NewDstReg);
+ MI.eraseFromParent();
}
bool CombinerHelper::matchCombineShuffleConcat(MachineInstr &MI,
@@ -559,8 +559,8 @@ void CombinerHelper::applyCombineShuffleVector(MachineInstr &MI,
else
Builder.buildMergeLikeInstr(NewDstReg, Ops);
- MI.eraseFromParent();
replaceRegWith(MRI, DstReg, NewDstReg);
+ MI.eraseFromParent();
}
bool CombinerHelper::matchShuffleToExtract(MachineInstr &MI) {
@@ -2825,8 +2825,8 @@ void CombinerHelper::replaceSingleDefInstWithOperand(MachineInstr &MI,
Register OldReg = MI.getOperand(0).getReg();
Register Replacement = MI.getOperand(OpIdx).getReg();
assert(canReplaceReg(OldReg, Replacement, MRI) && "Cannot replace register?");
- MI.eraseFromParent();
replaceRegWith(MRI, OldReg, Replacement);
+ MI.eraseFromParent();
}
void CombinerHelper::replaceSingleDefInstWithReg(MachineInstr &MI,
@@ -2834,8 +2834,8 @@ void CombinerHelper::replaceSingleDefInstWithReg(MachineInstr &MI,
assert(MI.getNumExplicitDefs() == 1 && "Expected one explicit def?");
Register OldReg = MI.getOperand(0).getReg();
assert(canReplaceReg(OldReg, Replacement, MRI) && "Cannot replace register?");
- MI.eraseFromParent();
replaceRegWith(MRI, OldReg, Replacement);
+ MI.eraseFromParent();
}
bool CombinerHelper::matchConstantLargerBitWidth(MachineInstr &MI,
diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
index 10d3cdc..c0c61b3 100644
--- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
@@ -703,7 +703,7 @@ bool MIRParserImpl::parseRegisterInfo(PerFunctionMIParsingState &PFS,
return error(FlagStringValue.SourceRange.Start,
Twine("use of undefined register flag '") +
FlagStringValue.Value + "'");
- Info.Flags.push_back(FlagValue);
+ Info.Flags |= FlagValue;
}
RegInfo.noteNewVirtualRegister(Info.VReg);
}
diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp
index 1a19e05..5bfc1d6 100644
--- a/llvm/lib/CodeGen/MachineCombiner.cpp
+++ b/llvm/lib/CodeGen/MachineCombiner.cpp
@@ -133,7 +133,7 @@ char &llvm::MachineCombinerID = MachineCombiner::ID;
INITIALIZE_PASS_BEGIN(MachineCombiner, DEBUG_TYPE,
"Machine InstCombiner", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
+INITIALIZE_PASS_DEPENDENCY(MachineTraceMetricsWrapperPass)
INITIALIZE_PASS_END(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner",
false, false)
@@ -142,8 +142,8 @@ void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addPreserved<MachineDominatorTreeWrapperPass>();
AU.addRequired<MachineLoopInfoWrapperPass>();
AU.addPreserved<MachineLoopInfoWrapperPass>();
- AU.addRequired<MachineTraceMetrics>();
- AU.addPreserved<MachineTraceMetrics>();
+ AU.addRequired<MachineTraceMetricsWrapperPass>();
+ AU.addPreserved<MachineTraceMetricsWrapperPass>();
AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
AU.addRequired<ProfileSummaryInfoWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
@@ -727,7 +727,7 @@ bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
TSchedModel.init(STI);
MRI = &MF.getRegInfo();
MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
- Traces = &getAnalysis<MachineTraceMetrics>();
+ Traces = &getAnalysis<MachineTraceMetricsWrapperPass>().getMTM();
PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
MBFI = (PSI && PSI->hasProfileSummary()) ?
&getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
diff --git a/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/llvm/lib/CodeGen/MachineTraceMetrics.cpp
index bf3add0..92df6b9 100644
--- a/llvm/lib/CodeGen/MachineTraceMetrics.cpp
+++ b/llvm/lib/CodeGen/MachineTraceMetrics.cpp
@@ -14,7 +14,6 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/SparseSet.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
@@ -40,49 +39,66 @@ using namespace llvm;
#define DEBUG_TYPE "machine-trace-metrics"
-char MachineTraceMetrics::ID = 0;
+AnalysisKey MachineTraceMetricsAnalysis::Key;
-char &llvm::MachineTraceMetricsID = MachineTraceMetrics::ID;
+MachineTraceMetricsAnalysis::Result
+MachineTraceMetricsAnalysis::run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM) {
+ return Result(MF, MFAM.getResult<MachineLoopAnalysis>(MF));
+}
+
+PreservedAnalyses
+MachineTraceMetricsVerifierPass::run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM) {
+ MFAM.getResult<MachineTraceMetricsAnalysis>(MF).verifyAnalysis();
+ return PreservedAnalyses::all();
+}
-INITIALIZE_PASS_BEGIN(MachineTraceMetrics, DEBUG_TYPE,
+char MachineTraceMetricsWrapperPass::ID = 0;
+
+char &llvm::MachineTraceMetricsID = MachineTraceMetricsWrapperPass::ID;
+
+INITIALIZE_PASS_BEGIN(MachineTraceMetricsWrapperPass, DEBUG_TYPE,
"Machine Trace Metrics", false, true)
-INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
-INITIALIZE_PASS_END(MachineTraceMetrics, DEBUG_TYPE,
+INITIALIZE_PASS_END(MachineTraceMetricsWrapperPass, DEBUG_TYPE,
"Machine Trace Metrics", false, true)
-MachineTraceMetrics::MachineTraceMetrics() : MachineFunctionPass(ID) {
- std::fill(std::begin(Ensembles), std::end(Ensembles), nullptr);
-}
+MachineTraceMetricsWrapperPass::MachineTraceMetricsWrapperPass()
+ : MachineFunctionPass(ID) {}
-void MachineTraceMetrics::getAnalysisUsage(AnalysisUsage &AU) const {
+void MachineTraceMetricsWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
- AU.addRequired<MachineBranchProbabilityInfoWrapperPass>();
AU.addRequired<MachineLoopInfoWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
-bool MachineTraceMetrics::runOnMachineFunction(MachineFunction &Func) {
+void MachineTraceMetrics::init(MachineFunction &Func,
+ const MachineLoopInfo &LI) {
MF = &Func;
const TargetSubtargetInfo &ST = MF->getSubtarget();
TII = ST.getInstrInfo();
TRI = ST.getRegisterInfo();
MRI = &MF->getRegInfo();
- Loops = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
+ Loops = &LI;
SchedModel.init(&ST);
BlockInfo.resize(MF->getNumBlockIDs());
ProcReleaseAtCycles.resize(MF->getNumBlockIDs() *
SchedModel.getNumProcResourceKinds());
+}
+
+bool MachineTraceMetricsWrapperPass::runOnMachineFunction(MachineFunction &MF) {
+ MTM.init(MF, getAnalysis<MachineLoopInfoWrapperPass>().getLI());
return false;
}
-void MachineTraceMetrics::releaseMemory() {
+MachineTraceMetrics::~MachineTraceMetrics() { clear(); }
+
+void MachineTraceMetrics::clear() {
MF = nullptr;
BlockInfo.clear();
- for (Ensemble *&E : Ensembles) {
- delete E;
- E = nullptr;
- }
+ for (auto &E : Ensembles)
+ E.reset();
}
//===----------------------------------------------------------------------===//
@@ -398,35 +414,50 @@ MachineTraceMetrics::Ensemble *
MachineTraceMetrics::getEnsemble(MachineTraceStrategy strategy) {
assert(strategy < MachineTraceStrategy::TS_NumStrategies &&
"Invalid trace strategy enum");
- Ensemble *&E = Ensembles[static_cast<size_t>(strategy)];
+ std::unique_ptr<MachineTraceMetrics::Ensemble> &E =
+ Ensembles[static_cast<size_t>(strategy)];
if (E)
- return E;
+ return E.get();
// Allocate new Ensemble on demand.
switch (strategy) {
case MachineTraceStrategy::TS_MinInstrCount:
- return (E = new MinInstrCountEnsemble(this));
+ E = std::make_unique<MinInstrCountEnsemble>(MinInstrCountEnsemble(this));
+ break;
case MachineTraceStrategy::TS_Local:
- return (E = new LocalEnsemble(this));
+ E = std::make_unique<LocalEnsemble>(LocalEnsemble(this));
+ break;
default: llvm_unreachable("Invalid trace strategy enum");
}
+ return E.get();
}
void MachineTraceMetrics::invalidate(const MachineBasicBlock *MBB) {
LLVM_DEBUG(dbgs() << "Invalidate traces through " << printMBBReference(*MBB)
<< '\n');
BlockInfo[MBB->getNumber()].invalidate();
- for (Ensemble *E : Ensembles)
+ for (auto &E : Ensembles)
if (E)
E->invalidate(MBB);
}
+bool MachineTraceMetrics::invalidate(
+ MachineFunction &, const PreservedAnalyses &PA,
+ MachineFunctionAnalysisManager::Invalidator &) {
+ // Check whether the analysis, all analyses on machine functions, or the
+ // machine function's CFG have been preserved.
+ auto PAC = PA.getChecker<MachineTraceMetricsAnalysis>();
+ return !PAC.preserved() &&
+ !PAC.preservedSet<AllAnalysesOn<MachineFunction>>() &&
+ !PAC.preservedSet<CFGAnalyses>();
+}
+
void MachineTraceMetrics::verifyAnalysis() const {
if (!MF)
return;
#ifndef NDEBUG
assert(BlockInfo.size() == MF->getNumBlockIDs() && "Outdated BlockInfo size");
- for (Ensemble *E : Ensembles)
+ for (auto &E : Ensembles)
if (E)
E->verify();
#endif
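
The new invalidate() hook above implements the standard new-PM
invalidation contract: the cached MachineTraceMetrics survives a pass
if that pass preserved the analysis explicitly, preserved every
MachineFunction analysis, or preserved the CFG. A sketch of a consumer
keeping the result alive (MyPass and doLocalRewrites are hypothetical):

    PreservedAnalyses MyPass::run(MachineFunction &MF,
                                  MachineFunctionAnalysisManager &MFAM) {
      bool Changed = doLocalRewrites(MF); // rewrites instrs, keeps CFG
      if (!Changed)
        return PreservedAnalyses::all();
      auto PA = getMachineFunctionPassPreservedAnalyses();
      PA.preserveSet<CFGAnalyses>(); // trace metrics stay valid
      return PA;
    }
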
diff --git a/llvm/lib/CodeGen/SafeStack.cpp b/llvm/lib/CodeGen/SafeStack.cpp
index a50909a..ad2037a 100644
--- a/llvm/lib/CodeGen/SafeStack.cpp
+++ b/llvm/lib/CodeGen/SafeStack.cpp
@@ -368,8 +368,7 @@ Value *SafeStack::getStackGuard(IRBuilder<> &IRB, Function &F) {
if (!StackGuardVar) {
TL.insertSSPDeclarations(*M);
- return IRB.CreateCall(
- Intrinsic::getOrInsertDeclaration(M, Intrinsic::stackguard));
+ return IRB.CreateIntrinsic(Intrinsic::stackguard, {}, {});
}
return IRB.CreateLoad(StackPtrTy, StackGuardVar, "StackGuard");
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 608ee85..ca91d35 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9594,6 +9594,7 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
}
// fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are setcc
+ // fold (not (and x, y)) -> (or (not x), (not y)) iff x or y are setcc
if (isOneConstant(N1) && VT == MVT::i1 && N0.hasOneUse() &&
(N0Opcode == ISD::OR || N0Opcode == ISD::AND)) {
SDValue N00 = N0.getOperand(0), N01 = N0.getOperand(1);
@@ -22567,7 +22568,7 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
return SDValue();
ISD::LoadExtType ExtTy =
- ResultVT.bitsGT(VecEltVT) ? ISD::NON_EXTLOAD : ISD::EXTLOAD;
+ ResultVT.bitsGT(VecEltVT) ? ISD::EXTLOAD : ISD::NON_EXTLOAD;
if (!TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT) ||
!TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT))
return SDValue();
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index ea22b46..e0a0338 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -4600,6 +4600,11 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
ExpandFPLibCall(Node, RTLIB::ATAN_F32, RTLIB::ATAN_F64, RTLIB::ATAN_F80,
RTLIB::ATAN_F128, RTLIB::ATAN_PPCF128, Results);
break;
+ case ISD::FATAN2:
+ case ISD::STRICT_FATAN2:
+ ExpandFPLibCall(Node, RTLIB::ATAN2_F32, RTLIB::ATAN2_F64, RTLIB::ATAN2_F80,
+ RTLIB::ATAN2_F128, RTLIB::ATAN2_PPCF128, Results);
+ break;
case ISD::FSINH:
case ISD::STRICT_FSINH:
ExpandFPLibCall(Node, RTLIB::SINH_F32, RTLIB::SINH_F64, RTLIB::SINH_F80,
@@ -5486,6 +5491,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
case ISD::FMINIMUMNUM:
case ISD::FMAXIMUMNUM:
case ISD::FPOW:
+ case ISD::FATAN2:
Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0));
Tmp2 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(1));
Tmp3 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2,
@@ -5502,6 +5508,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
case ISD::STRICT_FMAXNUM:
case ISD::STRICT_FREM:
case ISD::STRICT_FPOW:
+ case ISD::STRICT_FATAN2:
Tmp1 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
{Node->getOperand(0), Node->getOperand(1)});
Tmp2 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 2c81c82..73c258f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -84,6 +84,8 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
case ISD::FASIN: R = SoftenFloatRes_FASIN(N); break;
case ISD::STRICT_FATAN:
case ISD::FATAN: R = SoftenFloatRes_FATAN(N); break;
+ case ISD::STRICT_FATAN2:
+ case ISD::FATAN2: R = SoftenFloatRes_FATAN2(N); break;
case ISD::FCBRT: R = SoftenFloatRes_FCBRT(N); break;
case ISD::STRICT_FCEIL:
case ISD::FCEIL: R = SoftenFloatRes_FCEIL(N); break;
@@ -366,6 +368,13 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FATAN(SDNode *N) {
RTLIB::ATAN_F80, RTLIB::ATAN_F128, RTLIB::ATAN_PPCF128));
}
+SDValue DAGTypeLegalizer::SoftenFloatRes_FATAN2(SDNode *N) {
+ return SoftenFloatRes_Binary(
+ N,
+ GetFPLibCall(N->getValueType(0), RTLIB::ATAN2_F32, RTLIB::ATAN2_F64,
+ RTLIB::ATAN2_F80, RTLIB::ATAN2_F128, RTLIB::ATAN2_PPCF128));
+}
+
SDValue DAGTypeLegalizer::SoftenFloatRes_FCBRT(SDNode *N) {
return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
RTLIB::CBRT_F32,
@@ -1430,6 +1439,8 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) {
case ISD::FASIN: ExpandFloatRes_FASIN(N, Lo, Hi); break;
case ISD::STRICT_FATAN:
case ISD::FATAN: ExpandFloatRes_FATAN(N, Lo, Hi); break;
+ case ISD::STRICT_FATAN2:
+ case ISD::FATAN2: ExpandFloatRes_FATAN2(N, Lo, Hi); break;
case ISD::FCBRT: ExpandFloatRes_FCBRT(N, Lo, Hi); break;
case ISD::STRICT_FCEIL:
case ISD::FCEIL: ExpandFloatRes_FCEIL(N, Lo, Hi); break;
@@ -1631,6 +1642,15 @@ void DAGTypeLegalizer::ExpandFloatRes_FATAN(SDNode *N, SDValue &Lo,
Lo, Hi);
}
+void DAGTypeLegalizer::ExpandFloatRes_FATAN2(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ ExpandFloatRes_Binary(N,
+ GetFPLibCall(N->getValueType(0), RTLIB::ATAN2_F32,
+ RTLIB::ATAN2_F64, RTLIB::ATAN2_F80,
+ RTLIB::ATAN2_F128, RTLIB::ATAN2_PPCF128),
+ Lo, Hi);
+}
+
void DAGTypeLegalizer::ExpandFloatRes_FCBRT(SDNode *N, SDValue &Lo,
SDValue &Hi) {
ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0), RTLIB::CBRT_F32,
@@ -2673,6 +2693,7 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) {
case ISD::FMINNUM_IEEE:
case ISD::FMUL:
case ISD::FPOW:
+ case ISD::FATAN2:
case ISD::FREM:
case ISD::FSUB: R = PromoteFloatRes_BinOp(N); break;
@@ -3115,6 +3136,7 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) {
case ISD::FMINNUM:
case ISD::FMUL:
case ISD::FPOW:
+ case ISD::FATAN2:
case ISD::FREM:
case ISD::FSUB: R = SoftPromoteHalfRes_BinOp(N); break;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index d14516e..868da25 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -567,6 +567,7 @@ private:
SDValue SoftenFloatRes_FACOS(SDNode *N);
SDValue SoftenFloatRes_FASIN(SDNode *N);
SDValue SoftenFloatRes_FATAN(SDNode *N);
+ SDValue SoftenFloatRes_FATAN2(SDNode *N);
SDValue SoftenFloatRes_FMINNUM(SDNode *N);
SDValue SoftenFloatRes_FMAXNUM(SDNode *N);
SDValue SoftenFloatRes_FMINIMUMNUM(SDNode *N);
@@ -661,6 +662,7 @@ private:
void ExpandFloatRes_FACOS (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandFloatRes_FASIN (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandFloatRes_FATAN (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FATAN2 (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandFloatRes_FMINNUM (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandFloatRes_FMAXNUM (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandFloatRes_FMINIMUMNUM(SDNode *N, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index ffecca78a..a8042fc 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -410,6 +410,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
case ISD::FASIN:
case ISD::FACOS:
case ISD::FATAN:
+ case ISD::FATAN2:
case ISD::FSINH:
case ISD::FCOSH:
case ISD::FTANH:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index e0b47e1..50e2a92 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -164,6 +164,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
case ISD::USHLSAT:
case ISD::FPOW:
+ case ISD::FATAN2:
case ISD::FREM:
case ISD::FSUB:
case ISD::MUL:
@@ -1293,6 +1294,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::UDIV: case ISD::VP_UDIV:
case ISD::FDIV: case ISD::VP_FDIV:
case ISD::FPOW:
+ case ISD::FATAN2:
case ISD::AND: case ISD::VP_AND:
case ISD::OR: case ISD::VP_OR:
case ISD::XOR: case ISD::VP_XOR:
@@ -4581,6 +4583,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
break;
case ISD::FPOW:
+ case ISD::FATAN2:
case ISD::FREM:
if (unrollExpandedOp())
break;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index ff4b2f4..d63ed7e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5471,6 +5471,7 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) const
case ISD::FASIN:
case ISD::FACOS:
case ISD::FATAN:
+ case ISD::FATAN2:
case ISD::FSINH:
case ISD::FCOSH:
case ISD::FTANH:
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 805b8ec..9d82247 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6861,6 +6861,12 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
getValue(I.getArgOperand(0)), Flags));
return;
}
+ case Intrinsic::atan2:
+ setValue(&I, DAG.getNode(ISD::FATAN2, sdl,
+ getValue(I.getArgOperand(0)).getValueType(),
+ getValue(I.getArgOperand(0)),
+ getValue(I.getArgOperand(1)), Flags));
+ return;
case Intrinsic::lround:
case Intrinsic::llround:
case Intrinsic::lrint:
@@ -9353,6 +9359,12 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) {
if (visitUnaryFloatCall(I, ISD::FATAN))
return;
break;
+ case LibFunc_atan2:
+ case LibFunc_atan2f:
+ case LibFunc_atan2l:
+ if (visitBinaryFloatCall(I, ISD::FATAN2))
+ return;
+ break;
case LibFunc_sinh:
case LibFunc_sinhf:
case LibFunc_sinhl:
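
With the two hooks above, both the llvm.atan2 intrinsic and a direct
libm call that TLI classifies as LibFunc_atan2 are built as ISD::FATAN2
nodes. Source-level code that could exercise the LibFunc path (whether
it actually does depends on the usual visitBinaryFloatCall conditions,
e.g. errno handling):

    #include <cmath>

    double heading(double y, double x) {
      return std::atan2(y, x); // candidate for ISD::FATAN2 during ISel
    }
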
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 56fc538..703efb7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -227,6 +227,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::STRICT_FACOS: return "strict_facos";
case ISD::FATAN: return "fatan";
case ISD::STRICT_FATAN: return "strict_fatan";
+ case ISD::FATAN2: return "fatan2";
+ case ISD::STRICT_FATAN2: return "strict_fatan2";
case ISD::FSINH: return "fsinh";
case ISD::STRICT_FSINH: return "strict_fsinh";
case ISD::FCOSH: return "fcosh";
diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp
index a192161..0ce305c 100644
--- a/llvm/lib/CodeGen/StackProtector.cpp
+++ b/llvm/lib/CodeGen/StackProtector.cpp
@@ -519,8 +519,7 @@ static Value *getStackGuard(const TargetLoweringBase *TLI, Module *M,
if (SupportsSelectionDAGSP)
*SupportsSelectionDAGSP = true;
TLI->insertSSPDeclarations(*M);
- return B.CreateCall(
- Intrinsic::getOrInsertDeclaration(M, Intrinsic::stackguard));
+ return B.CreateIntrinsic(Intrinsic::stackguard, {}, {});
}
/// Insert code into the entry block that stores the stack guard
@@ -541,8 +540,7 @@ static bool CreatePrologue(Function *F, Module *M, Instruction *CheckLoc,
AI = B.CreateAlloca(PtrTy, nullptr, "StackGuardSlot");
Value *GuardSlot = getStackGuard(TLI, M, B, &SupportsSelectionDAGSP);
- B.CreateCall(Intrinsic::getOrInsertDeclaration(M, Intrinsic::stackprotector),
- {GuardSlot, AI});
+ B.CreateIntrinsic(Intrinsic::stackprotector, {}, {GuardSlot, AI});
return SupportsSelectionDAGSP;
}
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 1f49d60..7a28f78 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -783,7 +783,7 @@ void TargetLoweringBase::initActions() {
ISD::SIGN_EXTEND_VECTOR_INREG, ISD::ZERO_EXTEND_VECTOR_INREG,
ISD::SPLAT_VECTOR, ISD::LRINT, ISD::LLRINT, ISD::LROUND,
ISD::LLROUND, ISD::FTAN, ISD::FACOS, ISD::FASIN, ISD::FATAN,
- ISD::FCOSH, ISD::FSINH, ISD::FTANH},
+ ISD::FCOSH, ISD::FSINH, ISD::FTANH, ISD::FATAN2},
VT, Expand);
// Constrained floating-point operations default to expand.
@@ -842,7 +842,8 @@ void TargetLoweringBase::initActions() {
ISD::FEXP, ISD::FEXP2, ISD::FEXP10, ISD::FFLOOR,
ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC,
ISD::FROUNDEVEN, ISD::FTAN, ISD::FACOS, ISD::FASIN,
- ISD::FATAN, ISD::FCOSH, ISD::FSINH, ISD::FTANH},
+ ISD::FATAN, ISD::FCOSH, ISD::FSINH, ISD::FTANH,
+ ISD::FATAN2},
{MVT::f32, MVT::f64, MVT::f128}, Expand);
// FIXME: Query RuntimeLibCalls to make the decision.
@@ -850,7 +851,7 @@ void TargetLoweringBase::initActions() {
{MVT::f32, MVT::f64, MVT::f128}, LibCall);
setOperationAction({ISD::FTAN, ISD::FACOS, ISD::FASIN, ISD::FATAN, ISD::FCOSH,
- ISD::FSINH, ISD::FTANH},
+ ISD::FSINH, ISD::FTANH, ISD::FATAN2},
MVT::f16, Promote);
// Default ISD::TRAP to expand (which turns it into abort).
setOperationAction(ISD::TRAP, MVT::Other, Expand);
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index cf9d63d..02c3a85 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -305,7 +305,7 @@ static IdentifyingPassPtr overridePass(AnalysisID StandardID,
if (StandardID == &DeadMachineInstructionElimID)
return applyDisable(TargetID, DisableMachineDCE);
- if (StandardID == &EarlyIfConverterID)
+ if (StandardID == &EarlyIfConverterLegacyID)
return applyDisable(TargetID, DisableEarlyIfConversion);
if (StandardID == &EarlyMachineLICMID)
@@ -521,7 +521,7 @@ void llvm::registerCodeGenCallback(PassInstrumentationCallbacks &PIC,
DISABLE_PASS(DisableBlockPlacement, MachineBlockPlacementPass)
DISABLE_PASS(DisableBranchFold, BranchFolderPass)
DISABLE_PASS(DisableCopyProp, MachineCopyPropagationPass)
- DISABLE_PASS(DisableEarlyIfConversion, EarlyIfConverterPass)
+ DISABLE_PASS(DisableEarlyIfConversion, EarlyIfConverterLegacyPass)
DISABLE_PASS(DisableEarlyTailDup, EarlyTailDuplicatePass)
DISABLE_PASS(DisableMachineCSE, MachineCSELegacyPass)
DISABLE_PASS(DisableMachineDCE, DeadMachineInstructionElimPass)
diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
index c56ec19..401ed52 100644
--- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
@@ -608,7 +608,7 @@ Error ORCPlatformSupport::initialize(orc::JITDylib &JD) {
using llvm::orc::shared::SPSExecutorAddr;
using llvm::orc::shared::SPSString;
using SPSDLOpenSig = SPSExecutorAddr(SPSString, int32_t);
- using SPSDLUpdateSig = int32_t(SPSExecutorAddr, int32_t);
+ using SPSDLUpdateSig = int32_t(SPSExecutorAddr);
enum dlopen_mode : int32_t {
ORC_RT_RTLD_LAZY = 0x1,
ORC_RT_RTLD_NOW = 0x2,
@@ -634,8 +634,7 @@ Error ORCPlatformSupport::initialize(orc::JITDylib &JD) {
if (dlupdate) {
int32_t result;
auto E = ES.callSPSWrapper<SPSDLUpdateSig>(WrapperAddr->getAddress(),
- result, DSOHandles[&JD],
- int32_t(ORC_RT_RTLD_LAZY));
+ result, DSOHandles[&JD]);
if (E)
return E;
else if (result)
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 32f66f7..519ff8d 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1745,8 +1745,7 @@ static Value *upgradeX86VPERMT2Intrinsics(IRBuilder<> &Builder, CallBase &CI,
if (!IndexForm)
std::swap(Args[0], Args[1]);
- Value *V = Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(CI.getModule(), IID), Args);
+ Value *V = Builder.CreateIntrinsic(IID, {}, Args);
Value *PassThru = ZeroMask ? ConstantAggregateZero::get(Ty)
: Builder.CreateBitCast(CI.getArgOperand(1),
Ty);
@@ -2269,8 +2268,7 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder,
SmallVector<Value *, 4> Args(CI.args());
Args.pop_back();
Args.pop_back();
- Rep = Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(CI.getModule(), IID), Args);
+ Rep = Builder.CreateIntrinsic(IID, {}, Args);
unsigned NumArgs = CI.arg_size();
Rep = emitX86Select(Builder, CI.getArgOperand(NumArgs - 1), Rep,
CI.getArgOperand(NumArgs - 2));
@@ -2325,25 +2323,21 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI,
} else if (Name == "clz.ll") {
// llvm.nvvm.clz.ll returns an i32, but llvm.ctlz.i64 returns an i64.
Value *Arg = CI->getArgOperand(0);
- Value *Ctlz = Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(F->getParent(), Intrinsic::ctlz,
- {Arg->getType()}),
- {Arg, Builder.getFalse()}, "ctlz");
+ Value *Ctlz = Builder.CreateIntrinsic(Intrinsic::ctlz, {Arg->getType()},
+ {Arg, Builder.getFalse()},
+ /*FMFSource=*/nullptr, "ctlz");
Rep = Builder.CreateTrunc(Ctlz, Builder.getInt32Ty(), "ctlz.trunc");
} else if (Name == "popc.ll") {
// llvm.nvvm.popc.ll returns an i32, but llvm.ctpop.i64 returns an
// i64.
Value *Arg = CI->getArgOperand(0);
- Value *Popc = Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(F->getParent(), Intrinsic::ctpop,
- {Arg->getType()}),
- Arg, "ctpop");
+ Value *Popc = Builder.CreateIntrinsic(Intrinsic::ctpop, {Arg->getType()},
+ Arg, /*FMFSource=*/nullptr, "ctpop");
Rep = Builder.CreateTrunc(Popc, Builder.getInt32Ty(), "ctpop.trunc");
} else if (Name == "h2f") {
- Rep = Builder.CreateCall(Intrinsic::getOrInsertDeclaration(
- F->getParent(), Intrinsic::convert_from_fp16,
- {Builder.getFloatTy()}),
- CI->getArgOperand(0), "h2f");
+ Rep = Builder.CreateIntrinsic(Intrinsic::convert_from_fp16,
+ {Builder.getFloatTy()}, CI->getArgOperand(0),
+ /*FMFSource=*/nullptr, "h2f");
} else if (Name.consume_front("bitcast.") &&
(Name == "f2i" || Name == "i2f" || Name == "ll2d" ||
Name == "d2ll")) {
@@ -2493,10 +2487,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
} else if (Name.starts_with("avx.sqrt.p") ||
Name.starts_with("sse2.sqrt.p") ||
Name.starts_with("sse.sqrt.p")) {
- Rep =
- Builder.CreateCall(Intrinsic::getOrInsertDeclaration(
- F->getParent(), Intrinsic::sqrt, CI->getType()),
- {CI->getArgOperand(0)});
+ Rep = Builder.CreateIntrinsic(Intrinsic::sqrt, CI->getType(),
+ {CI->getArgOperand(0)});
} else if (Name.starts_with("avx512.mask.sqrt.p")) {
if (CI->arg_size() == 4 &&
(!isa<ConstantInt>(CI->getArgOperand(3)) ||
@@ -2505,13 +2497,10 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
: Intrinsic::x86_avx512_sqrt_pd_512;
Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(3)};
- Rep = Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args);
+ Rep = Builder.CreateIntrinsic(IID, {}, Args);
} else {
- Rep = Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(F->getParent(), Intrinsic::sqrt,
- CI->getType()),
- {CI->getArgOperand(0)});
+ Rep = Builder.CreateIntrinsic(Intrinsic::sqrt, CI->getType(),
+ {CI->getArgOperand(0)});
}
Rep =
emitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1));
@@ -2635,9 +2624,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
break;
}
- Rep = Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(F->getParent(), IID),
- {CI->getOperand(0), CI->getArgOperand(1)});
+ Rep = Builder.CreateIntrinsic(IID, {},
+ {CI->getOperand(0), CI->getArgOperand(1)});
Rep = applyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(2));
} else if (Name.starts_with("avx512.mask.fpclass.p")) {
Type *OpTy = CI->getArgOperand(0)->getType();
@@ -2659,9 +2647,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
else
llvm_unreachable("Unexpected intrinsic");
- Rep = Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(F->getParent(), IID),
- {CI->getOperand(0), CI->getArgOperand(1)});
+ Rep = Builder.CreateIntrinsic(IID, {},
+ {CI->getOperand(0), CI->getArgOperand(1)});
Rep = applyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(2));
} else if (Name.starts_with("avx512.cmp.p")) {
SmallVector<Value *, 4> Args(CI->args());
@@ -2689,8 +2676,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
std::swap(Mask, Args.back());
Args.push_back(Mask);
- Rep = Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(F->getParent(), IID), Args);
+ Rep = Builder.CreateIntrinsic(IID, {}, Args);
} else if (Name.starts_with("avx512.mask.cmp.")) {
// Integer compare intrinsics.
unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
@@ -3413,8 +3399,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
else
IID = Intrinsic::x86_avx512_add_pd_512;
- Rep = Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(F->getParent(), IID),
+ Rep = Builder.CreateIntrinsic(
+ IID, {},
{CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)});
} else {
Rep = Builder.CreateFAdd(CI->getArgOperand(0), CI->getArgOperand(1));
@@ -3429,8 +3415,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
else
IID = Intrinsic::x86_avx512_div_pd_512;
- Rep = Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(F->getParent(), IID),
+ Rep = Builder.CreateIntrinsic(
+ IID, {},
{CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)});
} else {
Rep = Builder.CreateFDiv(CI->getArgOperand(0), CI->getArgOperand(1));
@@ -3445,8 +3431,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
else
IID = Intrinsic::x86_avx512_mul_pd_512;
- Rep = Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(F->getParent(), IID),
+ Rep = Builder.CreateIntrinsic(
+ IID, {},
{CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)});
} else {
Rep = Builder.CreateFMul(CI->getArgOperand(0), CI->getArgOperand(1));
@@ -3461,8 +3447,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
else
IID = Intrinsic::x86_avx512_sub_pd_512;
- Rep = Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(F->getParent(), IID),
+ Rep = Builder.CreateIntrinsic(
+ IID, {},
{CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)});
} else {
Rep = Builder.CreateFSub(CI->getArgOperand(0), CI->getArgOperand(1));
@@ -3479,16 +3465,15 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
{Intrinsic::x86_avx512_min_ps_512, Intrinsic::x86_avx512_min_pd_512}};
Intrinsic::ID IID = MinMaxTbl[IsMin][IsDouble];
- Rep = Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(F->getParent(), IID),
+ Rep = Builder.CreateIntrinsic(
+ IID, {},
{CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)});
Rep =
emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2));
} else if (Name.starts_with("avx512.mask.lzcnt.")) {
Rep =
- Builder.CreateCall(Intrinsic::getOrInsertDeclaration(
- F->getParent(), Intrinsic::ctlz, CI->getType()),
- {CI->getArgOperand(0), Builder.getInt1(false)});
+ Builder.CreateIntrinsic(Intrinsic::ctlz, CI->getType(),
+ {CI->getArgOperand(0), Builder.getInt1(false)});
Rep =
emitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1));
} else if (Name.starts_with("avx512.mask.psll")) {
@@ -3732,10 +3717,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
if (NegAcc)
Ops[2] = Builder.CreateFNeg(Ops[2]);
- Rep = Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(CI->getModule(), Intrinsic::fma,
- Ops[0]->getType()),
- Ops);
+ Rep = Builder.CreateIntrinsic(Intrinsic::fma, Ops[0]->getType(), Ops);
if (IsScalar)
Rep = Builder.CreateInsertElement(CI->getArgOperand(0), Rep, (uint64_t)0);
@@ -3747,10 +3729,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
Ops[1] = Builder.CreateExtractElement(Ops[1], (uint64_t)0);
Ops[2] = Builder.CreateExtractElement(Ops[2], (uint64_t)0);
- Rep = Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(CI->getModule(), Intrinsic::fma,
- Ops[0]->getType()),
- Ops);
+ Rep = Builder.CreateIntrinsic(Intrinsic::fma, Ops[0]->getType(), Ops);
Rep = Builder.CreateInsertElement(Constant::getNullValue(CI->getType()),
Rep, (uint64_t)0);
@@ -3846,9 +3825,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
else
IID = Intrinsic::x86_avx512_vfmadd_pd_512;
- Rep = Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(F->getParent(), IID),
- {A, B, C, CI->getArgOperand(4)});
+ Rep = Builder.CreateIntrinsic(IID, {}, {A, B, C, CI->getArgOperand(4)});
} else {
Function *FMA = Intrinsic::getOrInsertDeclaration(
CI->getModule(), Intrinsic::fma, A->getType());
@@ -3878,8 +3855,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
Value *Ops[] = {CI->getArgOperand(0), CI->getArgOperand(1),
CI->getArgOperand(2)};
Ops[2] = Builder.CreateFNeg(Ops[2]);
- Rep = Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(F->getParent(), IID), Ops);
+ Rep = Builder.CreateIntrinsic(IID, {}, Ops);
} else if (Name.starts_with("avx512.mask.vfmaddsub.p") ||
Name.starts_with("avx512.mask3.vfmaddsub.p") ||
Name.starts_with("avx512.maskz.vfmaddsub.p") ||
@@ -3902,8 +3878,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
if (IsSubAdd)
Ops[2] = Builder.CreateFNeg(Ops[2]);
- Rep = Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(F->getParent(), IID), Ops);
+ Rep = Builder.CreateIntrinsic(IID, {}, Ops);
} else {
int NumElts = cast<FixedVectorType>(CI->getType())->getNumElements();
@@ -3954,8 +3929,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1),
CI->getArgOperand(2), CI->getArgOperand(3)};
- Rep = Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args);
+ Rep = Builder.CreateIntrinsic(IID, {}, Args);
Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType())
: CI->getArgOperand(0);
Rep = emitX86Select(Builder, CI->getArgOperand(4), Rep, PassThru);
@@ -3982,8 +3956,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1),
CI->getArgOperand(2)};
- Rep = Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args);
+ Rep = Builder.CreateIntrinsic(IID, {}, Args);
Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType())
: CI->getArgOperand(0);
Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru);
@@ -4018,8 +3991,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1),
CI->getArgOperand(2)};
- Rep = Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args);
+ Rep = Builder.CreateIntrinsic(IID, {}, Args);
Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType())
: CI->getArgOperand(0);
Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru);
@@ -4048,8 +4020,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1),
CI->getArgOperand(2)};
- Rep = Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args);
+ Rep = Builder.CreateIntrinsic(IID, {}, Args);
Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType())
: CI->getArgOperand(0);
Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru);
@@ -4071,8 +4042,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
// Make a call with 3 operands.
Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1),
CI->getArgOperand(2)};
- Value *NewCall = Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args);
+ Value *NewCall = Builder.CreateIntrinsic(IID, {}, Args);
// Extract the second result and store it.
Value *Data = Builder.CreateExtractValue(NewCall, 1);
@@ -4127,20 +4097,15 @@ static Value *upgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F,
if (Name == "mve.vctp64.old") {
// Replace the old v4i1 vctp64 with a v2i1 vctp and predicate-casts to the
// correct type.
- Value *VCTP =
- Builder.CreateCall(Intrinsic::getOrInsertDeclaration(
- F->getParent(), Intrinsic::arm_mve_vctp64),
- CI->getArgOperand(0), CI->getName());
- Value *C1 = Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(
- F->getParent(), Intrinsic::arm_mve_pred_v2i,
- {VectorType::get(Builder.getInt1Ty(), 2, false)}),
- VCTP);
- return Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(
- F->getParent(), Intrinsic::arm_mve_pred_i2v,
- {VectorType::get(Builder.getInt1Ty(), 4, false)}),
- C1);
+ Value *VCTP = Builder.CreateIntrinsic(Intrinsic::arm_mve_vctp64, {},
+ CI->getArgOperand(0),
+ /*FMFSource=*/nullptr, CI->getName());
+ Value *C1 = Builder.CreateIntrinsic(
+ Intrinsic::arm_mve_pred_v2i,
+ {VectorType::get(Builder.getInt1Ty(), 2, false)}, VCTP);
+ return Builder.CreateIntrinsic(
+ Intrinsic::arm_mve_pred_i2v,
+ {VectorType::get(Builder.getInt1Ty(), 4, false)}, C1);
} else if (Name == "mve.mull.int.predicated.v2i64.v4i32.v4i1" ||
Name == "mve.vqdmull.predicated.v2i64.v4i32.v4i1" ||
Name == "mve.vldr.gather.base.predicated.v2i64.v2i64.v4i1" ||
@@ -4198,15 +4163,10 @@ static Value *upgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F,
for (Value *Op : CI->args()) {
Type *Ty = Op->getType();
if (Ty->getScalarSizeInBits() == 1) {
- Value *C1 = Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(
- F->getParent(), Intrinsic::arm_mve_pred_v2i,
- {VectorType::get(Builder.getInt1Ty(), 4, false)}),
- Op);
- Op = Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(
- F->getParent(), Intrinsic::arm_mve_pred_i2v, {V2I1Ty}),
- C1);
+ Value *C1 = Builder.CreateIntrinsic(
+ Intrinsic::arm_mve_pred_v2i,
+ {VectorType::get(Builder.getInt1Ty(), 4, false)}, Op);
+ Op = Builder.CreateIntrinsic(Intrinsic::arm_mve_pred_i2v, {V2I1Ty}, C1);
}
Ops.push_back(Op);
}
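
All of the AutoUpgrade rewrites above are the same mechanical change:
IRBuilder::CreateIntrinsic folds the Intrinsic::getOrInsertDeclaration
lookup into the call construction. A minimal before/after sketch:

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Intrinsics.h"
    using namespace llvm;

    static Value *emitSqrt(IRBuilder<> &Builder, Value *X) {
      // Before:
      //   Builder.CreateCall(Intrinsic::getOrInsertDeclaration(
      //       M, Intrinsic::sqrt, X->getType()), {X});
      // After: one call; the overloaded-type list is mangled identically.
      return Builder.CreateIntrinsic(Intrinsic::sqrt, {X->getType()}, {X});
    }
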
diff --git a/llvm/lib/IR/Intrinsics.cpp b/llvm/lib/IR/Intrinsics.cpp
index ff8b4b7..1b92daf 100644
--- a/llvm/lib/IR/Intrinsics.cpp
+++ b/llvm/lib/IR/Intrinsics.cpp
@@ -724,6 +724,16 @@ Function *Intrinsic::getOrInsertDeclaration(Module *M, ID id,
.getCallee());
}
+Function *Intrinsic::getDeclarationIfExists(const Module *M, ID id) {
+ return M->getFunction(getName(id));
+}
+
+Function *Intrinsic::getDeclarationIfExists(Module *M, ID id,
+ ArrayRef<Type *> Tys,
+ FunctionType *FT) {
+ return M->getFunction(getName(id, Tys, M, FT));
+}
+
// This defines the "Intrinsic::getIntrinsicForClangBuiltin()" method.
#define GET_LLVM_INTRINSIC_FOR_CLANG_BUILTIN
#include "llvm/IR/IntrinsicImpl.inc"
diff --git a/llvm/lib/IR/LegacyPassManager.cpp b/llvm/lib/IR/LegacyPassManager.cpp
index 96e2f1d..ce6f6c73 100644
--- a/llvm/lib/IR/LegacyPassManager.cpp
+++ b/llvm/lib/IR/LegacyPassManager.cpp
@@ -104,15 +104,13 @@ void PMDataManager::emitInstrCountChangedRemark(
[&FunctionToInstrCount](Function &MaybeChangedFn) {
// Update the total module count.
unsigned FnSize = MaybeChangedFn.getInstructionCount();
- auto It = FunctionToInstrCount.find(MaybeChangedFn.getName());
// If we created a new function, then we need to add it to the map and
// say that it changed from 0 instructions to FnSize.
- if (It == FunctionToInstrCount.end()) {
- FunctionToInstrCount[MaybeChangedFn.getName()] =
- std::pair<unsigned, unsigned>(0, FnSize);
+ auto [It, Inserted] = FunctionToInstrCount.try_emplace(
+ MaybeChangedFn.getName(), 0, FnSize);
+ if (Inserted)
return;
- }
// Insert the new function size into the second member of the pair. This
// tells us whether or not this function changed in size.
It->second.second = FnSize;
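
The try_emplace form above does one hash lookup where the old code did
two (a find, then an inserting assignment). Self-contained sketch of
the idiom (std::map stands in for the map type used here):

    #include <map>
    #include <string>

    int main() {
      std::map<std::string, std::pair<unsigned, unsigned>> Counts;
      // Insert {0, FnSize} if absent; otherwise just update the size.
      auto [It, Inserted] = Counts.try_emplace("f", 0u, 42u);
      if (!Inserted)
        It->second.second = 42;
    }
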
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index d806f80..0616755 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -49,6 +49,7 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
setLibcallName(RTLIB::ASIN_F128, "asinf128");
setLibcallName(RTLIB::ACOS_F128, "acosf128");
setLibcallName(RTLIB::ATAN_F128, "atanf128");
+ setLibcallName(RTLIB::ATAN2_F128, "atan2f128");
setLibcallName(RTLIB::SINH_F128, "sinhf128");
setLibcallName(RTLIB::COSH_F128, "coshf128");
setLibcallName(RTLIB::TANH_F128, "tanhf128");
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 90c4e2c..0f53c60 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1120,13 +1120,13 @@ Error LTO::checkPartiallySplit() {
if (!ThinLTO.CombinedIndex.partiallySplitLTOUnits())
return Error::success();
- Function *TypeTestFunc = RegularLTO.CombinedModule->getFunction(
- Intrinsic::getName(Intrinsic::type_test));
- Function *TypeCheckedLoadFunc = RegularLTO.CombinedModule->getFunction(
- Intrinsic::getName(Intrinsic::type_checked_load));
- Function *TypeCheckedLoadRelativeFunc =
- RegularLTO.CombinedModule->getFunction(
- Intrinsic::getName(Intrinsic::type_checked_load_relative));
+ const Module *Combined = RegularLTO.CombinedModule.get();
+ Function *TypeTestFunc =
+ Intrinsic::getDeclarationIfExists(Combined, Intrinsic::type_test);
+ Function *TypeCheckedLoadFunc =
+ Intrinsic::getDeclarationIfExists(Combined, Intrinsic::type_checked_load);
+ Function *TypeCheckedLoadRelativeFunc = Intrinsic::getDeclarationIfExists(
+ Combined, Intrinsic::type_checked_load_relative);
// First check if there are type tests / type checked loads in the
// merged regular LTO module IR.
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 36c0cea..ebad350 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -82,6 +82,7 @@
#include "llvm/CodeGen/CodeGenPrepare.h"
#include "llvm/CodeGen/DeadMachineInstructionElim.h"
#include "llvm/CodeGen/DwarfEHPrepare.h"
+#include "llvm/CodeGen/EarlyIfConversion.h"
#include "llvm/CodeGen/ExpandLargeDivRem.h"
#include "llvm/CodeGen/ExpandLargeFpConvert.h"
#include "llvm/CodeGen/ExpandMemCmp.h"
@@ -109,6 +110,7 @@
#include "llvm/CodeGen/MachinePassManager.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
#include "llvm/CodeGen/MachineVerifier.h"
#include "llvm/CodeGen/PHIElimination.h"
#include "llvm/CodeGen/PreISelIntrinsicLowering.h"
diff --git a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
index 461fc43d..8881bff 100644
--- a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
+++ b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
@@ -18,14 +18,12 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
-#include "llvm/BinaryFormat/Wasm.h"
#include "llvm/Object/Archive.h"
#include "llvm/Object/Binary.h"
#include "llvm/Object/COFF.h"
#include "llvm/Object/Error.h"
#include "llvm/Object/MachOUniversal.h"
#include "llvm/Object/ObjectFile.h"
-#include "llvm/Object/Wasm.h"
#include "llvm/ProfileData/InstrProf.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Compression.h"
@@ -1079,53 +1077,6 @@ lookupSections(ObjectFile &OF, InstrProfSectKind IPSK) {
return Sections;
}
-/// Find a section that matches \p Name and is allocatable at runtime.
-///
-/// Returns the contents of the section and its start offset in the object file.
-static Expected<std::pair<StringRef, uint64_t>>
-lookupAllocatableSection(ObjectFile &OF, InstrProfSectKind IPSK) {
- // On Wasm, allocatable sections can live only in data segments.
- if (auto *WOF = dyn_cast<WasmObjectFile>(&OF)) {
- std::vector<const WasmSegment *> Segments;
- auto ObjFormat = OF.getTripleObjectFormat();
- auto Name =
- getInstrProfSectionName(IPSK, ObjFormat, /*AddSegmentInfo=*/false);
- for (const auto &DebugName : WOF->debugNames()) {
- if (DebugName.Type != wasm::NameType::DATA_SEGMENT ||
- DebugName.Name != Name)
- continue;
- if (DebugName.Index >= WOF->dataSegments().size())
- return make_error<CoverageMapError>(coveragemap_error::malformed);
- auto &Segment = WOF->dataSegments()[DebugName.Index];
- Segments.push_back(&Segment);
- }
- if (Segments.empty())
- return make_error<CoverageMapError>(coveragemap_error::no_data_found);
- if (Segments.size() != 1)
- return make_error<CoverageMapError>(coveragemap_error::malformed);
-
- const auto &Segment = *Segments.front();
- auto &Data = Segment.Data;
- StringRef Content(reinterpret_cast<const char *>(Data.Content.data()),
- Data.Content.size());
- return std::make_pair(Content, Segment.SectionOffset);
- }
-
- // On other object file types, delegate to lookupSections to find the section.
- auto Sections = lookupSections(OF, IPSK);
- if (!Sections)
- return Sections.takeError();
- if (Sections->size() != 1)
- return make_error<CoverageMapError>(
- coveragemap_error::malformed,
- "the size of coverage mapping section is not one");
- auto &Section = Sections->front();
- auto ContentsOrErr = Section.getContents();
- if (!ContentsOrErr)
- return ContentsOrErr.takeError();
- return std::make_pair(*ContentsOrErr, Section.getAddress());
-}
-
static Expected<std::unique_ptr<BinaryCoverageReader>>
loadBinaryFormat(std::unique_ptr<Binary> Bin, StringRef Arch,
StringRef CompilationDir = "",
@@ -1156,20 +1107,23 @@ loadBinaryFormat(std::unique_ptr<Binary> Bin, StringRef Arch,
// Look for the sections that we are interested in.
auto ProfileNames = std::make_unique<InstrProfSymtab>();
+ std::vector<SectionRef> NamesSectionRefs;
// If IPSK_name is not found, fall back to searching for IPSK_covname, which is
// used when binary correlation is enabled.
- auto NamesSection = lookupAllocatableSection(*OF, IPSK_name);
+ auto NamesSection = lookupSections(*OF, IPSK_name);
if (auto E = NamesSection.takeError()) {
consumeError(std::move(E));
- NamesSection = lookupAllocatableSection(*OF, IPSK_covname);
+ NamesSection = lookupSections(*OF, IPSK_covname);
if (auto E = NamesSection.takeError())
return std::move(E);
}
+ NamesSectionRefs = *NamesSection;
- uint64_t NamesAddress;
- StringRef NamesContent;
- std::tie(NamesContent, NamesAddress) = *NamesSection;
- if (Error E = ProfileNames->create(NamesContent, NamesAddress))
+ if (NamesSectionRefs.size() != 1)
+ return make_error<CoverageMapError>(
+ coveragemap_error::malformed,
+ "the size of coverage mapping section is not one");
+ if (Error E = ProfileNames->create(NamesSectionRefs.back()))
return std::move(E);
auto CoverageSection = lookupSections(*OF, IPSK_covmap);
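
The fallback above also shows the standard `Expected<T>` retry idiom: the first lookup's error must be explicitly consumed before retrying, and only the final failure is propagated. A reduced sketch with hypothetical names (`lookup`, `Section`, `PrimaryName`, `FallbackName`):

    // lookup() is assumed to return Expected<Section>.
    Expected<Section> Sec = lookup(PrimaryName);
    if (!Sec) {
      // An unchecked Error aborts at runtime, so the first failure must be
      // consumed explicitly before the fallback attempt.
      consumeError(Sec.takeError());
      Sec = lookup(FallbackName);
      if (!Sec)
        return Sec.takeError(); // both lookups failed: propagate
    }
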
diff --git a/llvm/lib/Support/FormatVariadic.cpp b/llvm/lib/Support/FormatVariadic.cpp
index 7eb1088..f3e8d0a 100644
--- a/llvm/lib/Support/FormatVariadic.cpp
+++ b/llvm/lib/Support/FormatVariadic.cpp
@@ -64,11 +64,10 @@ static std::optional<ReplacementItem> parseReplacementItem(StringRef Spec) {
AlignStyle Where = AlignStyle::Right;
StringRef Options;
unsigned Index = ~0U;
- RepString = RepString.trim();
+ RepString = RepString.ltrim();
// If index is not specified, keep it ~0U to indicate unresolved index.
RepString.consumeInteger(0, Index);
- RepString = RepString.trim();
if (RepString.consume_front(",")) {
if (!consumeFieldLayout(RepString, Where, Align, Pad)) {
@@ -76,9 +75,9 @@ static std::optional<ReplacementItem> parseReplacementItem(StringRef Spec) {
return std::nullopt;
}
}
- RepString = RepString.trim();
+ RepString = RepString.ltrim();
if (RepString.consume_front(":")) {
- Options = RepString.trim();
+ Options = RepString;
RepString = StringRef();
}
RepString = RepString.trim();
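
After this change the replacement parser only left-trims before the index and the layout clause, and forwards everything after `:` verbatim, so whitespace inside the options of `{index,layout:options}` now reaches the format provider unchanged. A minimal, hedged `formatv` sketch of the common cases this parsing serves:

    #include "llvm/Support/FormatVariadic.h"
    #include "llvm/Support/raw_ostream.h"

    int main() {
      // Index only.
      llvm::outs() << llvm::formatv("{0} + {1}\n", 1, 2); // "1 + 2"
      // ",6" is a layout clause: width 6, default right alignment.
      llvm::outs() << llvm::formatv("[{0,6}]\n", 42);     // "[    42]"
      return 0;
    }
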
diff --git a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index 9669a39..0301032 100644
--- a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -795,7 +795,7 @@ INITIALIZE_PASS_BEGIN(AArch64ConditionalCompares, "aarch64-ccmp",
"AArch64 CCMP Pass", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
+INITIALIZE_PASS_DEPENDENCY(MachineTraceMetricsWrapperPass)
INITIALIZE_PASS_END(AArch64ConditionalCompares, "aarch64-ccmp",
"AArch64 CCMP Pass", false, false)
@@ -809,8 +809,8 @@ void AArch64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addPreserved<MachineDominatorTreeWrapperPass>();
AU.addRequired<MachineLoopInfoWrapperPass>();
AU.addPreserved<MachineLoopInfoWrapperPass>();
- AU.addRequired<MachineTraceMetrics>();
- AU.addPreserved<MachineTraceMetrics>();
+ AU.addRequired<MachineTraceMetricsWrapperPass>();
+ AU.addPreserved<MachineTraceMetricsWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -937,7 +937,7 @@ bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) {
DomTree = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
Loops = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
MBPI = &getAnalysis<MachineBranchProbabilityInfoWrapperPass>().getMBPI();
- Traces = &getAnalysis<MachineTraceMetrics>();
+ Traces = &getAnalysis<MachineTraceMetricsWrapperPass>().getMTM();
MinInstr = nullptr;
MinSize = MF.getFunction().hasMinSize();
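
`MachineTraceMetrics` now follows the usual split between an analysis result and its legacy-PM wrapper: legacy passes depend on `MachineTraceMetricsWrapperPass` and unwrap the result with `getMTM()`, which also lets the new pass manager share the same analysis type. A condensed sketch of the consuming-pass pattern (`MyPass` is hypothetical):

    // Legacy pass manager: require/preserve the wrapper, unwrap in run().
    void MyPass::getAnalysisUsage(AnalysisUsage &AU) const {
      AU.addRequired<MachineTraceMetricsWrapperPass>();
      AU.addPreserved<MachineTraceMetricsWrapperPass>();
      MachineFunctionPass::getAnalysisUsage(AU);
    }

    bool MyPass::runOnMachineFunction(MachineFunction &MF) {
      MachineTraceMetrics &MTM =
          getAnalysis<MachineTraceMetricsWrapperPass>().getMTM();
      // Typical use: build an ensemble, as the CCMP pass does above.
      MachineTraceMetrics::Ensemble *MinInstr =
          MTM.getEnsemble(MachineTraceStrategy::TS_MinInstrCount);
      (void)MinInstr;
      return false;
    }
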
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ed06d8a..60150c3 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -27283,9 +27283,7 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
IRBuilderBase &Builder) const {
- Module *M = Builder.GetInsertBlock()->getParent()->getParent();
- Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(M, Intrinsic::aarch64_clrex));
+ Builder.CreateIntrinsic(Intrinsic::aarch64_clrex, {}, {});
}
Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 325508b..32f2c7c 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6237,7 +6237,8 @@ def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
// Some float -> int -> float conversion patterns for which we want to keep the
// int values in FP registers using the corresponding NEON instructions to
// avoid more costly int <-> fp register transfers.
-let Predicates = [HasNEONandIsStreamingSafe] in {
+// TODO: Allow these in streaming[-compatible] functions with +sme2p2.
+let Predicates = [HasNEON] in {
def : Pat<(f64 (any_sint_to_fp (i64 (any_fp_to_sint f64:$Rn)))),
(SCVTFv1i64 (i64 (FCVTZSv1i64 f64:$Rn)))>;
def : Pat<(f32 (any_sint_to_fp (i32 (any_fp_to_sint f32:$Rn)))),
@@ -6247,7 +6248,8 @@ def : Pat<(f64 (any_uint_to_fp (i64 (any_fp_to_uint f64:$Rn)))),
def : Pat<(f32 (any_uint_to_fp (i32 (any_fp_to_uint f32:$Rn)))),
(UCVTFv1i32 (i32 (FCVTZUv1i32 f32:$Rn)))>;
-let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in {
+// TODO: Allow these in streaming[-compatible] functions with +sme2p2.
+let Predicates = [HasNEON, HasFullFP16] in {
def : Pat<(f16 (any_sint_to_fp (i32 (any_fp_to_sint f16:$Rn)))),
(SCVTFv1i16 (f16 (FCVTZSv1f16 f16:$Rn)))>;
def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))),
@@ -6270,9 +6272,10 @@ def : Pat<(f64 (uint_to_fp (i64 (vector_extract (v2i64 FPR128:$Rn), (i64 0))))),
// fp16: integer extraction from vector must be at least 32-bits to be legal.
// Actual extraction result is then an in-reg sign-extension of lower 16-bits.
-let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in {
-def : Pat<(f16 (sint_to_fp (i32 (sext_inreg (i32 (vector_extract
- (v8i16 FPR128:$Rn), (i64 0))), i16)))),
+// TODO: Allow these in streaming[-compatible] functions with +sme2p2.
+let Predicates = [HasNEON, HasFullFP16] in {
+def : Pat<(f16 (sint_to_fp (i32 (sext_inreg (i32 (vector_extract
+ (v8i16 FPR128:$Rn), (i64 0))), i16)))),
(SCVTFv1i16 (f16 (EXTRACT_SUBREG (v8i16 FPR128:$Rn), hsub)))>;
// unsigned 32-bit extracted element is truncated to 16-bits using AND
@@ -6367,7 +6370,7 @@ def : Pat <(f64 (uint_to_fp (i32
(LDURSi GPR64sp:$Rn, simm9:$offset), ssub))>;
// 64-bits -> double are handled in target specific dag combine:
// performIntToFpCombine.
-} // let Predicates = [HasNEONandIsStreamingSafe]
+} // let Predicates = [HasNEON]
//===----------------------------------------------------------------------===//
// Advanced SIMD three different-sized vector instructions.
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 72f110c..85b9733 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -303,7 +303,7 @@ public:
void setLocalStackSize(uint64_t Size) { LocalStackSize = Size; }
uint64_t getLocalStackSize() const { return LocalStackSize; }
- void setOutliningStyle(std::string Style) { OutliningStyle = Style; }
+ void setOutliningStyle(const std::string &Style) { OutliningStyle = Style; }
std::optional<std::string> getOutliningStyle() const {
return OutliningStyle;
}
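
Taking the setter argument by `const std::string &` avoids the temporary that pass-by-value constructs for lvalue callers; either way one copy into the member remains. A tiny sketch of the two shapes and their trade-off, assuming a stored `std::optional<std::string>` as above:

    #include <optional>
    #include <string>
    #include <utility>

    struct Info {
      std::optional<std::string> OutliningStyle;

      // By const reference: one copy into the member, no extra temporary
      // for lvalue arguments (the shape used above).
      void setStyle(const std::string &S) { OutliningStyle = S; }

      // By value + move: one extra move for lvalues, but cheaper for
      // rvalues; the common alternative when callers often pass temporaries.
      void setStyleV(std::string S) { OutliningStyle = std::move(S); }
    };
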
diff --git a/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
index 047e382..d8c8b17 100644
--- a/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
@@ -53,8 +53,8 @@ private:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- AU.addRequired<MachineTraceMetrics>();
- AU.addPreserved<MachineTraceMetrics>();
+ AU.addRequired<MachineTraceMetricsWrapperPass>();
+ AU.addPreserved<MachineTraceMetricsWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
@@ -139,7 +139,7 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) {
TRI = ST.getRegisterInfo();
MRI = &MF.getRegInfo();
SchedModel.init(&ST);
- Traces = &getAnalysis<MachineTraceMetrics>();
+ Traces = &getAnalysis<MachineTraceMetricsWrapperPass>().getMTM();
MinInstr = nullptr;
LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << MF.getName() << '\n');
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 21b86f5..c7bd039 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -784,7 +784,7 @@ bool AArch64PassConfig::addILPOpts() {
if (EnableCondBrTuning)
addPass(createAArch64CondBrTuning());
if (EnableEarlyIfConversion)
- addPass(&EarlyIfConverterID);
+ addPass(&EarlyIfConverterLegacyID);
if (EnableStPairSuppress)
addPass(createAArch64StorePairSuppressPass());
addPass(createAArch64SIMDInstrOptPass());
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index a698948..e9d0160 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -215,19 +215,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.legalFor({s64, v8s16, v16s8, v4s32})
.lower();
- auto &MinMaxActions = getActionDefinitionsBuilder(
- {G_SMIN, G_SMAX, G_UMIN, G_UMAX});
- if (HasCSSC)
- MinMaxActions
- .legalFor({s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
- // Making clamping conditional on CSSC extension as without legal types we
- // lower to CMP which can fold one of the two sxtb's we'd otherwise need
- // if we detect a type smaller than 32-bit.
- .minScalar(0, s32);
- else
- MinMaxActions
- .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32});
- MinMaxActions
+ getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
+ .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
+ .legalFor(HasCSSC, {s32, s64})
+ .minScalar(HasCSSC, 0, s32)
.clampNumElements(0, v8s8, v16s8)
.clampNumElements(0, v4s16, v8s16)
.clampNumElements(0, v2s32, v4s32)
@@ -247,11 +238,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
{G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FMA, G_FSQRT, G_FMAXNUM, G_FMINNUM,
G_FMAXIMUM, G_FMINIMUM, G_FCEIL, G_FFLOOR, G_FRINT, G_FNEARBYINT,
G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
- .legalFor({MinFPScalar, s32, s64, v2s32, v4s32, v2s64})
- .legalIf([=](const LegalityQuery &Query) {
- const auto &Ty = Query.Types[0];
- return (Ty == v8s16 || Ty == v4s16) && HasFP16;
- })
+ .legalFor({s32, s64, v2s32, v4s32, v2s64})
+ .legalFor(HasFP16, {s16, v4s16, v8s16})
.libcallFor({s128})
.scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
.minScalarOrElt(0, MinFPScalar)
@@ -261,11 +249,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.moreElementsToNextPow2(0);
getActionDefinitionsBuilder({G_FABS, G_FNEG})
- .legalFor({MinFPScalar, s32, s64, v2s32, v4s32, v2s64})
- .legalIf([=](const LegalityQuery &Query) {
- const auto &Ty = Query.Types[0];
- return (Ty == v8s16 || Ty == v4s16) && HasFP16;
- })
+ .legalFor({s32, s64, v2s32, v4s32, v2s64})
+ .legalFor(HasFP16, {s16, v4s16, v8s16})
.scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
.lowerIf(scalarOrEltWiderThan(0, 64))
.clampNumElements(0, v4s16, v8s16)
@@ -350,31 +335,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
return ValTy.isPointerVector() && ValTy.getAddressSpace() == 0;
};
- auto &LoadActions = getActionDefinitionsBuilder(G_LOAD);
- auto &StoreActions = getActionDefinitionsBuilder(G_STORE);
-
- if (ST.hasSVE()) {
- LoadActions.legalForTypesWithMemDesc({
- // 128 bit base sizes
- {nxv16s8, p0, nxv16s8, 8},
- {nxv8s16, p0, nxv8s16, 8},
- {nxv4s32, p0, nxv4s32, 8},
- {nxv2s64, p0, nxv2s64, 8},
- });
-
- // TODO: Add nxv2p0. Consider bitcastIf.
- // See #92130
- // https://github.com/llvm/llvm-project/pull/92130#discussion_r1616888461
- StoreActions.legalForTypesWithMemDesc({
- // 128 bit base sizes
- {nxv16s8, p0, nxv16s8, 8},
- {nxv8s16, p0, nxv8s16, 8},
- {nxv4s32, p0, nxv4s32, 8},
- {nxv2s64, p0, nxv2s64, 8},
- });
- }
-
- LoadActions
+ getActionDefinitionsBuilder(G_LOAD)
.customIf([=](const LegalityQuery &Query) {
return HasRCPC3 && Query.Types[0] == s128 &&
Query.MMODescrs[0].Ordering == AtomicOrdering::Acquire;
@@ -399,6 +360,13 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
// These extends are also legal
.legalForTypesWithMemDesc(
{{s32, p0, s8, 8}, {s32, p0, s16, 8}, {s64, p0, s32, 8}})
+ .legalForTypesWithMemDesc({
+ // SVE vscale x 128 bit base sizes
+ {nxv16s8, p0, nxv16s8, 8},
+ {nxv8s16, p0, nxv8s16, 8},
+ {nxv4s32, p0, nxv4s32, 8},
+ {nxv2s64, p0, nxv2s64, 8},
+ })
.widenScalarToNextPow2(0, /* MinSize = */ 8)
.clampMaxNumElements(0, s8, 16)
.clampMaxNumElements(0, s16, 8)
@@ -425,7 +393,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.scalarizeIf(typeInSet(0, {v2s16, v2s8}), 0)
.scalarizeIf(scalarOrEltWiderThan(0, 64), 0);
- StoreActions
+ getActionDefinitionsBuilder(G_STORE)
.customIf([=](const LegalityQuery &Query) {
return HasRCPC3 && Query.Types[0] == s128 &&
Query.MMODescrs[0].Ordering == AtomicOrdering::Release;
@@ -445,6 +413,16 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
{p0, p0, s64, 8}, {s128, p0, s128, 8}, {v16s8, p0, s128, 8},
{v8s8, p0, s64, 8}, {v4s16, p0, s64, 8}, {v8s16, p0, s128, 8},
{v2s32, p0, s64, 8}, {v4s32, p0, s128, 8}, {v2s64, p0, s128, 8}})
+ .legalForTypesWithMemDesc({
+ // SVE vscale x 128 bit base sizes
+ // TODO: Add nxv2p0. Consider bitcastIf.
+ // See #92130
+ // https://github.com/llvm/llvm-project/pull/92130#discussion_r1616888461
+ {nxv16s8, p0, nxv16s8, 8},
+ {nxv8s16, p0, nxv8s16, 8},
+ {nxv4s32, p0, nxv4s32, 8},
+ {nxv2s64, p0, nxv2s64, 8},
+ })
.clampScalar(0, s8, s64)
.lowerIf([=](const LegalityQuery &Query) {
return Query.Types[0].isScalar() &&
@@ -532,12 +510,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.widenScalarToNextPow2(0)
.clampScalar(0, s8, s64);
getActionDefinitionsBuilder(G_FCONSTANT)
- .legalIf([=](const LegalityQuery &Query) {
- const auto &Ty = Query.Types[0];
- if (HasFP16 && Ty == s16)
- return true;
- return Ty == s32 || Ty == s64 || Ty == s128;
- })
+ .legalFor({s32, s64, s128})
+ .legalFor(HasFP16, {s16})
.clampScalar(0, MinFPScalar, s128);
// FIXME: fix moreElementsToNextPow2
@@ -569,16 +543,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.customIf(isVector(0));
getActionDefinitionsBuilder(G_FCMP)
- .legalFor({{s32, MinFPScalar},
- {s32, s32},
+ .legalFor({{s32, s32},
{s32, s64},
{v4s32, v4s32},
{v2s32, v2s32},
{v2s64, v2s64}})
- .legalIf([=](const LegalityQuery &Query) {
- const auto &Ty = Query.Types[1];
- return (Ty == v8s16 || Ty == v4s16) && Ty == Query.Types[0] && HasFP16;
- })
+ .legalFor(HasFP16, {{s32, s16}, {v4s16, v4s16}, {v8s16, v8s16}})
.widenScalarOrEltToNextPow2(1)
.clampScalar(0, s32, s32)
.minScalarOrElt(1, MinFPScalar)
@@ -693,13 +663,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
{v2s64, v2s64},
{v4s32, v4s32},
{v2s32, v2s32}})
- .legalIf([=](const LegalityQuery &Query) {
- return HasFP16 &&
- (Query.Types[1] == s16 || Query.Types[1] == v4s16 ||
- Query.Types[1] == v8s16) &&
- (Query.Types[0] == s32 || Query.Types[0] == s64 ||
- Query.Types[0] == v4s16 || Query.Types[0] == v8s16);
- })
+ .legalFor(HasFP16,
+ {{s32, s16}, {s64, s16}, {v4s16, v4s16}, {v8s16, v8s16}})
.scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
.scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
// The range of a fp16 value fits into an i17, so we can lower the width
@@ -741,13 +706,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
{v2s64, v2s64},
{v4s32, v4s32},
{v2s32, v2s32}})
- .legalIf([=](const LegalityQuery &Query) {
- return HasFP16 &&
- (Query.Types[1] == s16 || Query.Types[1] == v4s16 ||
- Query.Types[1] == v8s16) &&
- (Query.Types[0] == s32 || Query.Types[0] == s64 ||
- Query.Types[0] == v4s16 || Query.Types[0] == v8s16);
- })
+ .legalFor(HasFP16,
+ {{s32, s16}, {s64, s16}, {v4s16, v4s16}, {v8s16, v8s16}})
// Handle types larger than i64 by scalarizing/lowering.
.scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
.scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
@@ -790,13 +750,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
{v2s64, v2s64},
{v4s32, v4s32},
{v2s32, v2s32}})
- .legalIf([=](const LegalityQuery &Query) {
- return HasFP16 &&
- (Query.Types[0] == s16 || Query.Types[0] == v4s16 ||
- Query.Types[0] == v8s16) &&
- (Query.Types[1] == s32 || Query.Types[1] == s64 ||
- Query.Types[1] == v4s16 || Query.Types[1] == v8s16);
- })
+ .legalFor(HasFP16,
+ {{s16, s32}, {s16, s64}, {v4s16, v4s16}, {v8s16, v8s16}})
.scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
.scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
.moreElementsToNextPow2(1)
@@ -893,29 +848,21 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.lowerIf(
all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(2, p0)));
- LegalityPredicate UseOutlineAtomics = [&ST](const LegalityQuery &Query) {
- return ST.outlineAtomics() && !ST.hasLSE();
- };
+ bool UseOutlineAtomics = ST.outlineAtomics() && !ST.hasLSE();
getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
- .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0),
- predNot(UseOutlineAtomics)))
- .customIf(all(typeIs(0, s128), predNot(UseOutlineAtomics)))
- .customIf([UseOutlineAtomics](const LegalityQuery &Query) {
- return Query.Types[0].getSizeInBits() == 128 &&
- !UseOutlineAtomics(Query);
- })
- .libcallIf(all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(1, p0),
- UseOutlineAtomics))
+ .legalFor(!UseOutlineAtomics, {{s32, p0}, {s64, p0}})
+ .customFor(!UseOutlineAtomics, {{s128, p0}})
+ .libcallFor(UseOutlineAtomics,
+ {{s8, p0}, {s16, p0}, {s32, p0}, {s64, p0}, {s128, p0}})
.clampScalar(0, s32, s64);
getActionDefinitionsBuilder({G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD,
G_ATOMICRMW_SUB, G_ATOMICRMW_AND, G_ATOMICRMW_OR,
G_ATOMICRMW_XOR})
- .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0),
- predNot(UseOutlineAtomics)))
- .libcallIf(all(typeInSet(0, {s8, s16, s32, s64}), typeIs(1, p0),
- UseOutlineAtomics))
+ .legalFor(!UseOutlineAtomics, {{s32, p0}, {s64, p0}})
+ .libcallFor(UseOutlineAtomics,
+ {{s8, p0}, {s16, p0}, {s32, p0}, {s64, p0}})
.clampScalar(0, s32, s64);
// Do not outline these atomics operations, as per comment in
@@ -1050,12 +997,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.widenScalarToNextPow2(1, /*Min=*/32)
.clampScalar(1, s32, s64)
.scalarSameSizeAs(0, 1)
- .legalIf([=](const LegalityQuery &Query) {
- return (HasCSSC && typeInSet(0, {s32, s64})(Query));
- })
- .customIf([=](const LegalityQuery &Query) {
- return (!HasCSSC && typeInSet(0, {s32, s64})(Query));
- });
+ .legalFor(HasCSSC, {s32, s64})
+ .customFor(!HasCSSC, {s32, s64});
getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
.legalIf([=](const LegalityQuery &Query) {
@@ -1143,11 +1086,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
}
// FIXME: Legal vector types are only legal with NEON.
- auto &ABSActions = getActionDefinitionsBuilder(G_ABS);
- if (HasCSSC)
- ABSActions
- .legalFor({s32, s64});
- ABSActions.legalFor(PackedVectorAllTypeList)
+ getActionDefinitionsBuilder(G_ABS)
+ .legalFor(HasCSSC, {s32, s64})
+ .legalFor(PackedVectorAllTypeList)
.customIf([=](const LegalityQuery &Q) {
// TODO: Fix suboptimal codegen for 128+ bit types.
LLT SrcTy = Q.Types[0];
@@ -1171,10 +1112,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
// later.
getActionDefinitionsBuilder(G_VECREDUCE_FADD)
.legalFor({{s32, v2s32}, {s32, v4s32}, {s64, v2s64}})
- .legalIf([=](const LegalityQuery &Query) {
- const auto &Ty = Query.Types[1];
- return (Ty == v4s16 || Ty == v8s16) && HasFP16;
- })
+ .legalFor(HasFP16, {{s16, v4s16}, {s16, v8s16}})
.minScalarOrElt(0, MinFPScalar)
.clampMaxNumElements(1, s64, 2)
.clampMaxNumElements(1, s32, 4)
@@ -1215,10 +1153,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
getActionDefinitionsBuilder({G_VECREDUCE_FMIN, G_VECREDUCE_FMAX,
G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM})
.legalFor({{s32, v4s32}, {s32, v2s32}, {s64, v2s64}})
- .legalIf([=](const LegalityQuery &Query) {
- const auto &Ty = Query.Types[1];
- return Query.Types[0] == s16 && (Ty == v8s16 || Ty == v4s16) && HasFP16;
- })
+ .legalFor(HasFP16, {{s16, v4s16}, {s16, v8s16}})
.minScalarOrElt(0, MinFPScalar)
.clampMaxNumElements(1, s64, 2)
.clampMaxNumElements(1, s32, 4)
@@ -1295,32 +1230,16 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.customFor({{s32, s32}, {s64, s64}});
auto always = [=](const LegalityQuery &Q) { return true; };
- auto &CTPOPActions = getActionDefinitionsBuilder(G_CTPOP);
- if (HasCSSC)
- CTPOPActions
- .legalFor({{s32, s32},
- {s64, s64},
- {v8s8, v8s8},
- {v16s8, v16s8}})
- .customFor({{s128, s128},
- {v2s64, v2s64},
- {v2s32, v2s32},
- {v4s32, v4s32},
- {v4s16, v4s16},
- {v8s16, v8s16}});
- else
- CTPOPActions
- .legalFor({{v8s8, v8s8},
- {v16s8, v16s8}})
- .customFor({{s32, s32},
- {s64, s64},
- {s128, s128},
- {v2s64, v2s64},
- {v2s32, v2s32},
- {v4s32, v4s32},
- {v4s16, v4s16},
- {v8s16, v8s16}});
- CTPOPActions
+ getActionDefinitionsBuilder(G_CTPOP)
+ .legalFor(HasCSSC, {{s32, s32}, {s64, s64}})
+ .legalFor({{v8s8, v8s8}, {v16s8, v16s8}})
+ .customFor(!HasCSSC, {{s32, s32}, {s64, s64}})
+ .customFor({{s128, s128},
+ {v2s64, v2s64},
+ {v2s32, v2s32},
+ {v4s32, v4s32},
+ {v4s16, v4s16},
+ {v8s16, v8s16}})
.clampScalar(0, s32, s128)
.widenScalarToNextPow2(0)
.minScalarEltSameAsIf(always, 1, 0)
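
The legalizer rewrite above leans on predicated `LegalizeRuleSet` conveniences: overloads such as `legalFor(bool, ...)`, `customFor(bool, ...)`, `libcallFor(bool, ...)`, and `minScalar(bool, ...)` that apply the rule only when the leading flag is true. That lets a subtarget-conditional ruleset be written as one fluent chain instead of branching on a saved builder reference. A hedged sketch of the pattern, mirroring the G_CTPOP case:

    // One chain, statically conditional rules: the bool-predicated
    // overloads are no-ops when the flag is false, so both subtarget
    // configurations share the unconditional tail.
    getActionDefinitionsBuilder(G_CTPOP)
        .legalFor(HasCSSC, {{s32, s32}, {s64, s64}})   // only with +cssc
        .legalFor({{v8s8, v8s8}, {v16s8, v16s8}})      // always legal
        .customFor(!HasCSSC, {{s32, s32}, {s64, s64}}) // scalar fallback
        .clampScalar(0, s32, s128)
        .widenScalarToNextPow2(0);
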
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index b2a3f93..985fa8f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -169,5 +169,6 @@ def AMDGPURegBankCombiner : GICombiner<
"AMDGPURegBankCombinerImpl",
[unmerge_merge, unmerge_cst, unmerge_undef,
zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
- fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp]> {
+ fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
+ redundant_and]> {
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index d16c96f..6573176 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -171,8 +171,8 @@ public:
// Try to allocate SGPRs to preload implicit kernel arguments.
void tryAllocImplicitArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
IRBuilder<> &Builder) {
- StringRef Name = Intrinsic::getName(Intrinsic::amdgcn_implicitarg_ptr);
- Function *ImplicitArgPtr = F.getParent()->getFunction(Name);
+ Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists(
+ F.getParent(), Intrinsic::amdgcn_implicitarg_ptr);
if (!ImplicitArgPtr)
return;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
index 7d66d07..1bb5e79 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -78,8 +78,7 @@ public:
Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
auto IntrinsicId = IsV5OrAbove ? Intrinsic::amdgcn_implicitarg_ptr
: Intrinsic::amdgcn_dispatch_ptr;
- StringRef Name = Intrinsic::getName(IntrinsicId);
- return M.getFunction(Name);
+ return Intrinsic::getDeclarationIfExists(&M, IntrinsicId);
}
} // end anonymous namespace
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
index cfce56f..51af16c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
@@ -921,9 +921,8 @@ void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func,
FunctionCallee AsanFreeFunc = M.getOrInsertFunction(
StringRef("__asan_free_impl"),
FunctionType::get(IRB.getVoidTy(), {Int64Ty, Int64Ty}, false));
- Value *ReturnAddr = IRB.CreateCall(
- Intrinsic::getOrInsertDeclaration(&M, Intrinsic::returnaddress),
- IRB.getInt32(0));
+ Value *ReturnAddr =
+ IRB.CreateIntrinsic(Intrinsic::returnaddress, {}, IRB.getInt32(0));
Value *RAPToInt = IRB.CreatePtrToInt(ReturnAddr, Int64Ty);
Value *MallocPtrToInt = IRB.CreatePtrToInt(LoadMallocPtr, Int64Ty);
IRB.CreateCall(AsanFreeFunc, {MallocPtrToInt, RAPToInt});
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 23ee0c3..e4cc522 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1335,7 +1335,7 @@ void GCNPassConfig::addMachineSSAOptimization() {
bool GCNPassConfig::addILPOpts() {
if (EnableEarlyIfConversion)
- addPass(&EarlyIfConverterID);
+ addPass(&EarlyIfConverterLegacyID);
TargetPassConfig::addILPOpts();
return false;
@@ -1983,6 +1983,13 @@ void AMDGPUCodeGenPassBuilder::addPreISel(AddIRPass &addPass) const {
addPass(RequireAnalysisPass<UniformityInfoAnalysis, Function>());
}
+void AMDGPUCodeGenPassBuilder::addILPOpts(AddMachinePass &addPass) const {
+ if (EnableEarlyIfConversion)
+ addPass(EarlyIfConverterPass());
+
+ Base::addILPOpts(addPass);
+}
+
void AMDGPUCodeGenPassBuilder::addAsmPrinter(AddMachinePass &addPass,
CreateMCStreamer) const {
// TODO: Add AsmPrinter.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index af8476b..d8a5111 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -172,6 +172,7 @@ public:
void addIRPasses(AddIRPass &) const;
void addCodeGenPrepare(AddIRPass &) const;
void addPreISel(AddIRPass &addPass) const;
+ void addILPOpts(AddMachinePass &) const;
void addAsmPrinter(AddMachinePass &, CreateMCStreamer) const;
Error addInstSelector(AddMachinePass &) const;
void addMachineSSAOptimization(AddMachinePass &) const;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8c197f2..de9173e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -8786,7 +8786,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
const Module *M = MF.getFunction().getParent();
const GlobalValue *GV =
- M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize));
+ Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
SIInstrInfo::MO_ABS32_LO);
return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d676d56..abd6c7c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7366,14 +7366,25 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
const DebugLoc &DL = Inst.getDebugLoc();
Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
- .addImm(16)
- .add(Inst.getOperand(1));
- BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
- .addImm(0) // src0_modifiers
- .addReg(TmpReg)
- .addImm(0) // clamp
- .addImm(0); // omod
+ if (ST.useRealTrue16Insts()) {
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
+ .add(Inst.getOperand(1));
+ BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
+ .addImm(0) // src0_modifiers
+ .addReg(TmpReg, 0, AMDGPU::hi16)
+ .addImm(0) // clamp
+ .addImm(0) // omod
+ .addImm(0); // op_sel0
+ } else {
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
+ .addImm(16)
+ .add(Inst.getOperand(1));
+ BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
+ .addImm(0) // src0_modifiers
+ .addReg(TmpReg)
+ .addImm(0) // clamp
+ .addImm(0); // omod
+ }
MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 087ca1f..42a1ffb 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2149,6 +2149,8 @@ class getAsmVOP3P <int NumSrcArgs, bit HasModifiers,
string ret = dst#", "#src0#src1#src2#opsel#mods#clamp;
}
+// FIXME-TRUE16: AsmVOP3OpSel will be deprecated once all
+// VOP3 16-bit instructions are migrated to the true16 format.
class getAsmVOP3OpSel <int NumSrcArgs,
bit HasClamp,
bit HasOMod,
@@ -2237,8 +2239,9 @@ class getAsmVOP3Base <int NumSrcArgs, bit HasDst, bit HasClamp,
string clamp = !if(HasClamp, "$clamp", "");
string omod = !if(HasOMod, "$omod", "");
- string ret = dst#!if(!gt(NumSrcArgs,0),", "#src0#src1#src2#opsel#bytesel#3PMods#clamp#omod, "");
-
+ string ret = dst#!if(!eq(NumSrcArgs,0),
+ "",
+ !if(HasDst,", ", "")#src0#src1#src2#opsel#bytesel#3PMods#clamp#omod);
}
class getAsmVOP3DPP<string base> {
@@ -2733,6 +2736,7 @@ def VOP_F32_F32_F16_F16 : VOPProfile <[f32, f32, f16, f16]>;
def VOP_F32_F32_F32_F32 : VOPProfile <[f32, f32, f32, f32]>;
def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>;
def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>;
+def VOP_I32_I32_I32_I16 : VOPProfile <[i32, i32, i32, i16]>;
def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>;
def VOP_I32_F32_I32_I32 : VOPProfile <[i32, f32, i32, i32]>;
def VOP_I64_I64_I32_I64 : VOPProfile <[i64, i64, i32, i64]>;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 8073aca..faa0b6d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1094,7 +1094,7 @@ def : Pat <
// VOP1 Patterns
//===----------------------------------------------------------------------===//
-multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16_inst_e64> {
+multiclass f16_to_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16_inst_e64> {
// f16_to_fp patterns
def : GCNPat <
(f32 (any_f16_to_fp i32:$src0)),
@@ -1121,25 +1121,42 @@ multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16
(cvt_f32_f16_inst_e64 SRCMODS.NEG, $src0)
>;
+ // fp_to_fp16 patterns
def : GCNPat <
- (f64 (any_fpextend f16:$src)),
- (V_CVT_F64_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src))
+ (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+ (cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0)
>;
- // fp_to_fp16 patterns
+ // This is only used on targets without half support
+ // TODO: Introduce strict variant of AMDGPUfp_to_f16 and share custom lowering
def : GCNPat <
- (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+ (i32 (strict_fp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
(cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0)
>;
+}
+
+let True16Predicate = NotHasTrue16BitInsts in
+defm : f16_to_fp_Pats<V_CVT_F16_F32_e64, V_CVT_F32_F16_e64>;
+
+let True16Predicate = UseFakeTrue16Insts in
+defm : f16_to_fp_Pats<V_CVT_F16_F32_fake16_e64, V_CVT_F32_F16_fake16_e64>;
+
+multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64,
+ Instruction cvt_f32_f16_inst_e64,
+ RegOrImmOperand VSrc> {
+ def : GCNPat <
+ (f64 (any_fpextend f16:$src)),
+ (V_CVT_F64_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src))
+ >;
def : GCNPat <
(i32 (fp_to_sint f16:$src)),
- (V_CVT_I32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc_b32:$src))
+ (V_CVT_I32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc:$src))
>;
def : GCNPat <
(i32 (fp_to_uint f16:$src)),
- (V_CVT_U32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc_b32:$src))
+ (V_CVT_U32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc:$src))
>;
def : GCNPat <
@@ -1151,20 +1168,16 @@ multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16
(f16 (uint_to_fp i32:$src)),
(cvt_f16_f32_inst_e64 SRCMODS.NONE, (V_CVT_F32_U32_e32 VSrc_b32:$src))
>;
-
- // This is only used on targets without half support
- // TODO: Introduce strict variant of AMDGPUfp_to_f16 and share custom lowering
- def : GCNPat <
- (i32 (strict_fp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
- (cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0)
- >;
}
let True16Predicate = NotHasTrue16BitInsts in
-defm : f16_fp_Pats<V_CVT_F16_F32_e64, V_CVT_F32_F16_e64>;
+defm : f16_fp_Pats<V_CVT_F16_F32_e64, V_CVT_F32_F16_e64, VSrc_b32>;
+
+let True16Predicate = UseRealTrue16Insts in
+defm : f16_fp_Pats<V_CVT_F16_F32_t16_e64, V_CVT_F32_F16_t16_e64, VSrcT_b16>;
let True16Predicate = UseFakeTrue16Insts in
-defm : f16_fp_Pats<V_CVT_F16_F32_fake16_e64, V_CVT_F32_F16_fake16_e64>;
+defm : f16_fp_Pats<V_CVT_F16_F32_fake16_e64, V_CVT_F32_F16_fake16_e64, VSrc_b16>;
//===----------------------------------------------------------------------===//
// VOP2 Patterns
@@ -2774,16 +2787,27 @@ def : GCNPat <
SSrc_i1:$src))
>;
-let SubtargetPredicate = HasTrue16BitInsts in
+let True16Predicate = UseRealTrue16Insts in
def : GCNPat <
(f16 (sint_to_fp i1:$src)),
- (V_CVT_F16_F32_fake16_e32 (
- V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ (V_CVT_F16_F32_t16_e64 /*src0_modifiers*/ 0,
+ (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
/*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE),
- SSrc_i1:$src))
+ SSrc_i1:$src),
+ /*clamp*/ 0, /*omod*/ 0, /*op_sel*/ 0)
>;
-let SubtargetPredicate = NotHasTrue16BitInsts in
+let True16Predicate = UseFakeTrue16Insts in
+def : GCNPat <
+ (f16 (sint_to_fp i1:$src)),
+ (V_CVT_F16_F32_fake16_e64 /*src0_modifiers*/ 0,
+ (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE),
+ SSrc_i1:$src),
+ /*clamp*/ 0, /*omod*/ 0)
+>;
+
+let True16Predicate = NotHasTrue16BitInsts in
def : GCNPat <
(f16 (uint_to_fp i1:$src)),
(V_CVT_F16_F32_e32 (
@@ -2791,13 +2815,25 @@ def : GCNPat <
/*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
SSrc_i1:$src))
>;
-let SubtargetPredicate = HasTrue16BitInsts in
+
+let True16Predicate = UseRealTrue16Insts in
def : GCNPat <
(f16 (uint_to_fp i1:$src)),
- (V_CVT_F16_F32_fake16_e32 (
- V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ (V_CVT_F16_F32_t16_e64 /*src0_modifiers*/ 0,
+ (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
/*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
- SSrc_i1:$src))
+ SSrc_i1:$src),
+ /*clamp*/ 0, /*omod*/ 0, /*op_sel*/ 0)
+>;
+
+let True16Predicate = UseFakeTrue16Insts in
+def : GCNPat <
+ (f16 (uint_to_fp i1:$src)),
+ (V_CVT_F16_F32_fake16_e64 /*src0_modifiers*/ 0,
+ (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
+ SSrc_i1:$src),
+ /*clamp*/ 0, /*omod*/ 0)
>;
def : GCNPat <
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index be98d20..701aeda 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -503,7 +503,7 @@ let FPDPRounding = 1 in {
defm V_FRACT_F16 : VOP1Inst_t16 <"v_fract_f16", VOP_F16_F16, AMDGPUfract>;
} // End FPDPRounding = 1
-let OtherPredicates = [Has16BitInsts, NotHasTrue16BitInsts] in {
+let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in {
def : GCNPat<
(f32 (f16_to_fp i16:$src)),
(V_CVT_F32_F16_e32 $src)
@@ -513,7 +513,7 @@ def : GCNPat<
(V_CVT_F16_F32_e32 $src)
>;
}
-let OtherPredicates = [HasTrue16BitInsts] in {
+let True16Predicate = UseRealTrue16Insts in {
def : GCNPat<
(f32 (f16_to_fp i16:$src)),
(V_CVT_F32_F16_t16_e32 $src)
@@ -523,6 +523,16 @@ def : GCNPat<
(V_CVT_F16_F32_t16_e32 $src)
>;
}
+let True16Predicate = UseFakeTrue16Insts in {
+def : GCNPat<
+ (f32 (f16_to_fp i16:$src)),
+ (V_CVT_F32_F16_fake16_e32 $src)
+>;
+def : GCNPat<
+ (i16 (AMDGPUfp_to_f16 f32:$src)),
+ (V_CVT_F16_F32_fake16_e32 $src)
+>;
+}
def VOP_SWAP_I32 : VOPProfile<[i32, i32, untyped, untyped]> {
let Outs32 = (outs VGPR_32:$vdst, VRegSrc_32:$vdst1);
@@ -1417,15 +1427,14 @@ def : GCNPat <
} // End OtherPredicates = [isGFX8Plus, p]
-let OtherPredicates = [UseFakeTrue16Insts] in {
+let True16Predicate = UseFakeTrue16Insts in {
def : GCNPat<
(i32 (DivergentUnaryFrag<anyext> i16:$src)),
(COPY $src)
>;
-} // End OtherPredicates = [UseFakeTrue16Insts]
-
+} // End True16Predicate = UseFakeTrue16Insts
-let OtherPredicates = [UseRealTrue16Insts] in {
+let True16Predicate = UseRealTrue16Insts in {
def : GCNPat<
(i32 (UniformUnaryFrag<anyext> (i16 SReg_32:$src))),
(COPY $src)
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 639f918..e83ea57 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -1664,8 +1664,8 @@ multiclass VOP3Only_Realtriple_gfx11_gfx12<bits<10> op> :
VOP3Only_Realtriple<GFX11Gen, op>, VOP3Only_Realtriple<GFX12Gen, op>;
multiclass VOP3Only_Realtriple_t16_gfx11_gfx12<bits<10> op, string asmName, string OpName = NAME> :
- VOP3Only_Realtriple_t16<GFX11Gen, op, asmName, OpName>,
- VOP3Only_Realtriple_t16<GFX12Gen, op, asmName, OpName>;
+ VOP3_Realtriple_t16_gfx11<op, asmName, OpName, "", /*IsSingle*/1>,
+ VOP3_Realtriple_t16_gfx12<op, asmName, OpName, "", /*IsSingle*/1>;
multiclass VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<bits<10> op, string asmName, string OpName = NAME> {
defm OpName#"_t16": VOP3Only_Realtriple_t16_gfx11_gfx12<op, asmName, OpName#"_t16">;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 78ca7a2..34ecdb5 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -569,16 +569,10 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
getAsmVOP3OpSel<3, HasClamp, HasOMod,
HasSrc0FloatMods, HasSrc1FloatMods,
HasSrc2FloatMods>.ret);
- let AsmVOP3DPP16 = !subst(", $src2_modifiers", "",
- getAsmVOP3DPP16<getAsmVOP3Base<3, 1, HasClamp, 1,
- HasOMod, 0, 1, HasSrc0FloatMods,
- HasSrc1FloatMods,
- HasSrc2FloatMods>.ret>.ret);
- let AsmVOP3DPP8 = !subst(", $src2_modifiers", "",
- getAsmVOP3DPP8<getAsmVOP3Base<3, 1, HasClamp, 1,
- HasOMod, 0, 1, HasSrc0FloatMods,
- HasSrc1FloatMods,
- HasSrc2FloatMods>.ret>.ret);
+ let AsmVOP3Base = !subst(", $src2_modifiers", "",
+ getAsmVOP3Base<NumSrcArgs, HasDst, HasClamp,
+ HasOpSel, HasOMod, IsVOP3P, HasModifiers, HasModifiers, 0/*Src1Mods*/,
+ HasModifiers, DstVT>.ret);
}
class VOP3_CVT_SR_F8_ByteSel_Profile<ValueType SrcVT> :
@@ -636,8 +630,8 @@ let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
defm V_MAXIMUM3_F16 : VOP3Inst <"v_maximum3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmaximum3>;
} // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0
-defm V_ADD_I16 : VOP3Inst <"v_add_i16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>>;
-defm V_SUB_I16 : VOP3Inst <"v_sub_i16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>>;
+defm V_ADD_I16 : VOP3Inst_t16 <"v_add_i16", VOP_I16_I16_I16>;
+defm V_SUB_I16 : VOP3Inst_t16 <"v_sub_i16", VOP_I16_I16_I16>;
defm V_MAD_U32_U16 : VOP3Inst <"v_mad_u32_u16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
defm V_MAD_I32_I16 : VOP3Inst <"v_mad_i32_i16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
@@ -752,6 +746,8 @@ def : GCNPat<(DivergentBinFrag<or> (or_oneuse i64:$src0, i64:$src1), i64:$src2),
(i32 (EXTRACT_SUBREG $src1, sub1)),
(i32 (EXTRACT_SUBREG $src2, sub1))), sub1)>;
+} // End SubtargetPredicate = isGFX9Plus
+
// FIXME: Probably should hardcode clamp bit in pseudo and avoid this.
class OpSelBinOpClampPat<SDPatternOperator node,
Instruction inst> : GCNPat<
@@ -760,9 +756,14 @@ class OpSelBinOpClampPat<SDPatternOperator node,
(inst $src0_modifiers, $src0, $src1_modifiers, $src1, DSTCLAMP.ENABLE, 0)
>;
-def : OpSelBinOpClampPat<saddsat, V_ADD_I16_e64>;
-def : OpSelBinOpClampPat<ssubsat, V_SUB_I16_e64>;
-} // End SubtargetPredicate = isGFX9Plus
+let SubtargetPredicate = isGFX9Plus, True16Predicate = NotHasTrue16BitInsts in {
+ def : OpSelBinOpClampPat<saddsat, V_ADD_I16_e64>;
+ def : OpSelBinOpClampPat<ssubsat, V_SUB_I16_e64>;
+} // End SubtargetPredicate = isGFX9Plus, True16Predicate = NotHasTrue16BitInsts
+let True16Predicate = UseFakeTrue16Insts in {
+ def : OpSelBinOpClampPat<saddsat, V_ADD_I16_fake16_e64>;
+ def : OpSelBinOpClampPat<ssubsat, V_SUB_I16_fake16_e64>;
+} // End True16Predicate = UseFakeTrue16Insts
multiclass IMAD32_Pats <VOP3_Pseudo inst> {
def : GCNPat <
@@ -871,21 +872,31 @@ let SubtargetPredicate = isGFX10Plus in {
def : PermlanePat<int_amdgcn_permlanex16, V_PERMLANEX16_B32_e64, vt>;
}
- defm V_ADD_NC_U16 : VOP3Inst <"v_add_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, add>;
- defm V_SUB_NC_U16 : VOP3Inst <"v_sub_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, sub>;
-
- def : OpSelBinOpClampPat<uaddsat, V_ADD_NC_U16_e64>;
- def : OpSelBinOpClampPat<usubsat, V_SUB_NC_U16_e64>;
-
- // Undo sub x, c -> add x, -c canonicalization since c is more likely
- // an inline immediate than -c.
- def : GCNPat<
- (add i16:$src0, (i16 NegSubInlineIntConst16:$src1)),
- (V_SUB_NC_U16_e64 0, VSrc_b16:$src0, 0, NegSubInlineIntConst16:$src1, 0, 0)
- >;
+ defm V_ADD_NC_U16 : VOP3Inst_t16 <"v_add_nc_u16", VOP_I16_I16_I16, add>;
+ defm V_SUB_NC_U16 : VOP3Inst_t16 <"v_sub_nc_u16", VOP_I16_I16_I16, sub>;
} // End SubtargetPredicate = isGFX10Plus
+let True16Predicate = NotHasTrue16BitInsts, SubtargetPredicate = isGFX10Plus in {
+ def : OpSelBinOpClampPat<uaddsat, V_ADD_NC_U16_e64>;
+ def : OpSelBinOpClampPat<usubsat, V_SUB_NC_U16_e64>;
+ // Undo sub x, c -> add x, -c canonicalization since c is more likely
+ // an inline immediate than -c.
+ def : GCNPat<
+ (add i16:$src0, (i16 NegSubInlineIntConst16:$src1)),
+ (V_SUB_NC_U16_e64 0, VSrc_b16:$src0, 0, NegSubInlineIntConst16:$src1, 0, 0)
+ >;
+} // End True16Predicate = NotHasTrue16BitInsts, SubtargetPredicate = isGFX10Plus
+
+let True16Predicate = UseFakeTrue16Insts in {
+ def : OpSelBinOpClampPat<uaddsat, V_ADD_NC_U16_fake16_e64>;
+ def : OpSelBinOpClampPat<usubsat, V_SUB_NC_U16_fake16_e64>;
+ def : GCNPat<
+ (add i16:$src0, (i16 NegSubInlineIntConst16:$src1)),
+ (V_SUB_NC_U16_fake16_e64 0, VSrc_b16:$src0, 0, NegSubInlineIntConst16:$src1, 0, 0)
+ >;
+} // End True16Predicate = UseFakeTrue16Insts
+
let SubtargetPredicate = isGFX12Plus in {
let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
defm V_PERMLANE16_VAR_B32 : VOP3Inst<"v_permlane16_var_b32", VOP3_PERMLANE_VAR_Profile>;
@@ -1104,6 +1115,17 @@ multiclass VOP3_Realtriple_with_name_gfx11_gfx12<bits<10> op, string opName,
multiclass VOP3Dot_Realtriple_gfx11_gfx12<bits<10> op> :
VOP3Dot_Realtriple<GFX11Gen, op>, VOP3Dot_Realtriple<GFX12Gen, op>;
+multiclass VOP3_Realtriple_t16_gfx11_gfx12<bits<10> op, string asmName, string opName = NAME,
+ string pseudo_mnemonic = "", bit isSingle = 0> :
+ VOP3_Realtriple_with_name<GFX11Gen, op, opName, asmName, pseudo_mnemonic, isSingle>,
+ VOP3_Realtriple_with_name<GFX12Gen, op, opName, asmName, pseudo_mnemonic, isSingle>;
+
+multiclass VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<bits<10> op, string asmName, string opName = NAME,
+ string pseudo_mnemonic = "", bit isSingle = 0> {
+ defm opName#"_t16": VOP3_Realtriple_t16_gfx11_gfx12<op, asmName, opName#"_t16", pseudo_mnemonic, isSingle>;
+ defm opName#"_fake16": VOP3_Realtriple_t16_gfx11_gfx12<op, asmName, opName#"_fake16", pseudo_mnemonic, isSingle>;
+}
+
multiclass VOP3be_Real_gfx11_gfx12<bits<10> op, string opName, string asmName> :
VOP3be_Real<GFX11Gen, op, opName, asmName>,
VOP3be_Real<GFX12Gen, op, opName, asmName>;
@@ -1189,8 +1211,8 @@ defm V_DIV_SCALE_F32 : VOP3be_Real_gfx11_gfx12<0x2fc, "V_DIV_SCALE_F32", "
defm V_DIV_SCALE_F64 : VOP3be_Real_gfx11_gfx12<0x2fd, "V_DIV_SCALE_F64", "v_div_scale_f64">;
defm V_MAD_U64_U32_gfx11 : VOP3be_Real_gfx11<0x2fe, "V_MAD_U64_U32_gfx11", "v_mad_u64_u32">;
defm V_MAD_I64_I32_gfx11 : VOP3be_Real_gfx11<0x2ff, "V_MAD_I64_I32_gfx11", "v_mad_i64_i32">;
-defm V_ADD_NC_U16 : VOP3Only_Realtriple_gfx11_gfx12<0x303>;
-defm V_SUB_NC_U16 : VOP3Only_Realtriple_gfx11_gfx12<0x304>;
+defm V_ADD_NC_U16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x303, "v_add_nc_u16">;
+defm V_SUB_NC_U16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x304, "v_sub_nc_u16">;
defm V_MUL_LO_U16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x305, "v_mul_lo_u16">;
defm V_CVT_PK_I16_F32 : VOP3_Realtriple_gfx11_gfx12<0x306>;
defm V_CVT_PK_U16_F32 : VOP3_Realtriple_gfx11_gfx12<0x307>;
@@ -1198,8 +1220,8 @@ defm V_MAX_U16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x30
defm V_MAX_I16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x30a, "v_max_i16">;
defm V_MIN_U16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x30b, "v_min_u16">;
defm V_MIN_I16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x30c, "v_min_i16">;
-defm V_ADD_NC_I16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x30d, "V_ADD_I16", "v_add_nc_i16">;
-defm V_SUB_NC_I16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x30e, "V_SUB_I16", "v_sub_nc_i16">;
+defm V_ADD_NC_I16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x30d, "v_add_nc_i16", "V_ADD_I16">;
+defm V_SUB_NC_I16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x30e, "v_sub_nc_i16", "V_SUB_I16">;
defm V_PACK_B32_F16 : VOP3_Realtriple_gfx11_gfx12<0x311>;
defm V_CVT_PK_NORM_I16_F16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x312, "V_CVT_PKNORM_I16_F16" , "v_cvt_pk_norm_i16_f16" >;
defm V_CVT_PK_NORM_U16_F16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x313, "V_CVT_PKNORM_U16_F16" , "v_cvt_pk_norm_u16_f16" >;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 05a7d90..aab5dc7 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -111,7 +111,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
bit HasFP8DstByteSel = P.HasFP8DstByteSel;
- let AsmOperands = !if(isVop3OpSel,
+ let AsmOperands = !if(!and(!not(P.IsTrue16), isVop3OpSel),
P.AsmVOP3OpSel,
!if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64));
@@ -178,6 +178,7 @@ class VOP3_Real <VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemoni
let SubtargetPredicate = ps.SubtargetPredicate;
let WaveSizePredicate = ps.WaveSizePredicate;
let OtherPredicates = ps.OtherPredicates;
+ let True16Predicate = ps.True16Predicate;
let AsmMatchConverter = ps.AsmMatchConverter;
let AsmVariantName = ps.AsmVariantName;
let Constraints = ps.Constraints;
@@ -242,6 +243,41 @@ class VOP3a<VOPProfile P> : Enc64 {
let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0);
}
+// To avoid having a different version of every operand type depending on
+// whether it is part of a True16 instruction, the operand encoding should be
+// the same for SGPR, imm, and VGPR_32 whether the instruction is True16 or not.
+class VOP3a_t16<VOPProfile P> : Enc64 {
+ bits<11> vdst;
+ bits<4> src0_modifiers;
+ bits<11> src0;
+ bits<3> src1_modifiers;
+ bits<11> src1;
+ bits<3> src2_modifiers;
+ bits<11> src2;
+ bits<1> clamp;
+ bits<2> omod;
+
+ let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0);
+ let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0);
+ let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0);
+ let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0);
+ // 16-bit select fields which can be interpreted as OpSel or hi/lo suffix
+ let Inst{11} = !if(P.HasSrc0Mods, src0_modifiers{2}, 0);
+ let Inst{12} = !if(P.HasSrc1Mods, src1_modifiers{2}, 0);
+ let Inst{13} = !if(P.HasSrc2Mods, src2_modifiers{2}, 0);
+ let Inst{14} = !if(!and(P.HasDst, P.HasSrc0Mods), src0_modifiers{3}, 0);
+ let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
+
+ let Inst{31-26} = 0x35;
+ let Inst{40-32} = !if(P.HasSrc0, src0{8-0}, 0);
+ let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, 0);
+ let Inst{58-50} = !if(P.HasSrc2, src2{8-0}, 0);
+ let Inst{60-59} = !if(P.HasOMod, omod, 0);
+ let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0);
+ let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0);
+ let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0);
+}
+
class VOP3a_gfx6_gfx7<bits<9> op, VOPProfile p> : VOP3a<p> {
let Inst{11} = !if(p.HasClamp, clamp{0}, 0);
let Inst{25-17} = op;
@@ -272,6 +308,10 @@ class VOP3e_gfx10<bits<10> op, VOPProfile p> : VOP3a_gfx10<op, p> {
class VOP3e_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p>;
+class VOP3e_t16_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3a_t16<p> {
+ let Inst{25-16} = op;
+}
+
class VOP3e_vi <bits<10> op, VOPProfile P> : VOP3a_vi <op, P> {
bits<8> vdst;
let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0);
@@ -736,7 +776,12 @@ class VOP3_DPPe_Fields : VOP3_DPPe_Fields_Base {
bits<8> src0;
}
+class VOP3_DPPe_Fields_t16 : VOP3_DPPe_Fields_Base {
+ bits<11> src0;
+}
+
// Common refers to common between DPP and DPP8
+// Base refers to a shared base between T16 and regular instructions
class VOP3_DPPe_Common_Base<bits<10> op, VOPProfile P> : Enc96 {
bits<4> src0_modifiers;
bits<3> src1_modifiers;
@@ -748,7 +793,7 @@ class VOP3_DPPe_Common_Base<bits<10> op, VOPProfile P> : Enc96 {
let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0);
let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0);
let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0);
- // OPSEL must be set such that the low result only uses low inputs, and the high result only uses high inputs.
+ // 16-bit select fields which can be interpreted as OpSel or hi/lo suffix
let Inst{11} = !if(P.HasOpSel, !if(P.HasSrc0Mods, src0_modifiers{2}, 0),
!if(P.IsFP8SrcByteSel, byte_sel{1}, ?));
let Inst{12} = !if(P.HasOpSel, !if(P.HasSrc1Mods, src1_modifiers{2}, 0),
@@ -777,6 +822,16 @@ class VOP3_DPPe_Common<bits<10> op, VOPProfile P> : VOP3_DPPe_Common_Base<op, P>
let Inst{58-50} = !if(P.HasSrc2, src2, 0);
}
+class VOP3_DPPe_Common_t16<bits<10> op, VOPProfile P> : VOP3_DPPe_Common_Base<op, P> {
+ bits<11> vdst;
+ bits<11> src1;
+ bits<11> src2;
+
+ let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0);
+ let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, 0);
+ let Inst{58-50} = !if(P.HasSrc2, src2{8-0}, 0);
+}
+
class VOP3P_DPPe_Common_Base<bits<7> op, VOPProfile P> : Enc96 {
bits<4> src0_modifiers;
bits<4> src1_modifiers;
@@ -786,6 +841,7 @@ class VOP3P_DPPe_Common_Base<bits<7> op, VOPProfile P> : Enc96 {
let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0
let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // neg_hi src1
let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); // neg_hi src2
+ // OPSEL must be set such that the low result only uses low inputs, and the high result only uses high inputs.
let Inst{11} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{2}, 0); // op_sel(0)
let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1)
let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, 0); // op_sel(2)
@@ -810,6 +866,16 @@ class VOP3P_DPPe_Common<bits<7> op, VOPProfile P> : VOP3P_DPPe_Common_Base<op, P
let Inst{58-50} = !if(P.HasSrc2, src2, 0);
}
+class VOP3P_DPPe_Common_t16<bits<7> op, VOPProfile P> : VOP3P_DPPe_Common_Base<op, P> {
+ bits<11> vdst;
+ bits<11> src1;
+ bits<11> src2;
+
+ let Inst{7-0} = vdst{7-0};
+ let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, 0);
+ let Inst{58-50} = !if(P.HasSrc2, src2{8-0}, 0);
+}
+
class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[],
dag Ins = P.InsDPP, string asmOps = P.AsmDPP> :
VOP_Pseudo<OpName, "_dpp", P, P.OutsDPP, Ins, asmOps, pattern> {
@@ -870,6 +936,7 @@ class VOP_DPP_Real <VOP_DPP_Pseudo ps, int EncodingFamily> :
// Copy relevant pseudo op flags
let isConvergent = ps.isConvergent;
let SubtargetPredicate = ps.SubtargetPredicate;
+ let True16Predicate = ps.True16Predicate;
let AssemblerPredicate = ps.AssemblerPredicate;
let OtherPredicates = ps.OtherPredicates;
let AsmMatchConverter = ps.AsmMatchConverter;
@@ -928,11 +995,29 @@ class VOP3_DPP_Base <string OpName, VOPProfile P, bit IsDPP16,
let Size = 12;
}
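+// Encoding bits factored out of VOP3_DPP so that the regular and True16 real
+// instructions can each combine the shared base class with their own encoding.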
+class VOP3_DPP_Enc <bits<10> op, VOPProfile P, bit IsDPP16> :
+ VOP3_DPPe_Common<op, P>,
+ VOP3_DPPe_Fields {
+
+ let Inst{40-32} = 0xfa;
+ let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0);
+ let Inst{80-72} = dpp_ctrl;
+ let Inst{82} = !if(IsDPP16, fi, ?);
+ let Inst{83} = bound_ctrl;
+
+ // Inst{87-84} ignored by hw
+ let Inst{91-88} = bank_mask;
+ let Inst{95-92} = row_mask;
+}
+
class VOP3_DPP <bits<10> op, string OpName, VOPProfile P, bit IsDPP16,
dag InsDPP = !if(IsDPP16, P.InsVOP3DPP16, P.InsVOP3DPP),
string AsmDPP = !if(IsDPP16, P.AsmVOP3DPP16, P.AsmVOP3DPP)> :
- VOP3_DPP_Base<OpName, P, IsDPP16, InsDPP, AsmDPP>, VOP3_DPPe_Common<op, P>,
- VOP3_DPPe_Fields {
+ VOP3_DPP_Base<OpName, P, IsDPP16, InsDPP, AsmDPP>, VOP3_DPP_Enc<op, P, IsDPP16>;
+
+class VOP3_DPP_Enc_t16<bits<10> op, VOPProfile P, bit IsDPP16>
+ : VOP3_DPPe_Common_t16<op, P>,
+ VOP3_DPPe_Fields_t16 {
let Inst{40-32} = 0xfa;
let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0);
@@ -945,6 +1030,13 @@ class VOP3_DPP <bits<10> op, string OpName, VOPProfile P, bit IsDPP16,
let Inst{95-92} = row_mask;
}
+class VOP3_DPP_t16<bits<10> op, string OpName, VOPProfile P, bit IsDPP16,
+                   dag InsDPP = !if(IsDPP16, P.InsVOP3DPP16, P.InsVOP3DPP),
+                   string AsmDPP = !if(IsDPP16, P.AsmVOP3DPP16, P.AsmVOP3DPP)>
+ : VOP3_DPP_Base<OpName, P, IsDPP16, InsDPP, AsmDPP>,
+ VOP3_DPP_Enc_t16<op, P, IsDPP16> {
+}
+
class VOP3P_DPP <bits<7> op, string OpName, VOPProfile P, bit IsDPP16,
dag InsDPP = !if(IsDPP16, P.InsVOP3DPP16, P.InsVOP3DPP),
string AsmDPP = !if(IsDPP16, P.AsmVOP3DPP16, P.AsmVOP3DPP)> :
@@ -979,6 +1071,12 @@ class VOP3_DPP8e_Fields {
bits<9> fi;
}
+class VOP3_DPP8e_Fields_t16 {
+ bits<11> src0;
+ bits<24> dpp8;
+ bits<9> fi;
+}
+
class VOP_DPP8_Base<string OpName, VOPProfile P, dag InsDPP8 = P.InsDPP8, string AsmDPP8 = P.AsmDPP8> :
InstSI<P.OutsDPP8, InsDPP8, OpName#AsmDPP8, []> {
@@ -1011,16 +1109,28 @@ class VOP3_DPP8_Base<string OpName, VOPProfile P> :
let Size = 12;
}
+class VOP3_DPP8_Enc <bits<10> op, VOPProfile P> :
+ VOP3_DPPe_Common<op, P>,
+ VOP3_DPP8e_Fields {
+ let Inst{40-32} = fi;
+ let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0);
+ let Inst{95-72} = dpp8{23-0};
+}
class VOP3_DPP8<bits<10> op, string OpName, VOPProfile P> :
- VOP3_DPP8_Base<OpName, P>, VOP3_DPPe_Common<op, P>,
- VOP3_DPP8e_Fields {
+ VOP3_DPP8_Base<OpName, P>, VOP3_DPP8_Enc<op, P>;
+class VOP3_DPP8_Enc_t16 <bits<10> op, VOPProfile P> :
+ VOP3_DPPe_Common_t16<op, P>,
+ VOP3_DPP8e_Fields_t16 {
let Inst{40-32} = fi;
let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0);
let Inst{95-72} = dpp8{23-0};
}
+class VOP3_DPP8_t16<bits<10> op, string OpName, VOPProfile P> :
+ VOP3_DPP8_Base<OpName, P>, VOP3_DPP8_Enc_t16<op, P>;
+
class VOP3P_DPP8<bits<7> op, string OpName, VOPProfile P> :
VOP3_DPP8_Base<OpName, P>, VOP3P_DPPe_Common<op, P>,
VOP3_DPP8e_Fields {
@@ -1273,6 +1383,30 @@ class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOP3_Pr
}
+class VOP3_Profile_True16<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOPProfile_True16<P> {
+ let HasClamp = !if(Features.HasClamp, 1, P.HasClamp);
+ let HasOpSel = !if(Features.HasOpSel, 1, P.HasOpSel);
+ let IsMAI = !if(Features.IsMAI, 1, P.IsMAI);
+ let IsPacked = !if(Features.IsPacked, 1, P.IsPacked);
+
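+  // MAI instructions never have source modifiers; packed math and op_sel
+  // imply them; otherwise defer to the base profile.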
+ let HasModifiers =
+      !if(Features.IsMAI, 0,
+          !or(Features.IsPacked, Features.HasOpSel, P.HasModifiers));
+ let IsSingle = 1;
+}
+
+class VOP3_Profile_Fake16<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOPProfile_Fake16<P> {
+ let HasClamp = !if(Features.HasClamp, 1, P.HasClamp);
+ let HasOpSel = !if(Features.HasOpSel, 1, P.HasOpSel);
+ let IsMAI = !if(Features.IsMAI, 1, P.IsMAI);
+ let IsPacked = !if(Features.IsPacked, 1, P.IsPacked);
+
+ let HasModifiers =
+      !if(Features.IsMAI, 0,
+          !or(Features.IsPacked, Features.HasOpSel, P.HasModifiers));
+ let IsSingle = 1;
+}
+
// consistently gives instructions a _e64 suffix
multiclass VOP3Inst_Pseudo_Wrapper<string opName, VOPProfile P, list<dag> pattern = [], bit VOP3Only = 0> {
def _e64 : VOP3_Pseudo<opName, P, pattern, VOP3Only>;
@@ -1325,11 +1459,33 @@ multiclass VOP3PseudoScalarInst<string OpName, VOPProfile P,
i32:$omod))))]>;
}
+multiclass VOP3Inst_t16_with_profiles<string OpName, VOPProfile P, VOPProfile P_t16,
+ VOPProfile P_fake16,
+ SDPatternOperator node = null_frag,
+ SDPatternOperator node_t16 = node> {
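+  // Emit one pseudo per True16 mode: the unsuffixed variant for subtargets
+  // without True16, plus real (_t16) and fake (_fake16) 16-bit variants.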
+ let True16Predicate = NotHasTrue16BitInsts in {
+ defm NAME : VOP3Inst<OpName, P, node>;
+ }
+ let True16Predicate = UseRealTrue16Insts in {
+ defm _t16 : VOP3Inst<OpName # "_t16", P_t16, node_t16>;
+ }
+ let True16Predicate = UseFakeTrue16Insts in {
+ defm _fake16 : VOP3Inst<OpName # "_fake16", P_fake16, node>;
+ }
+}
+
+multiclass VOP3Inst_t16<string OpName, VOPProfile P,
+ SDPatternOperator node = null_frag,
+ SDPatternOperator node_t16 = node>
+ : VOP3Inst_t16_with_profiles<OpName, VOP3_Profile<P, VOP3_OPSEL>,
+ VOP3_Profile_True16<P, VOP3_OPSEL>, VOP3_Profile_Fake16<P, VOP3_OPSEL>,
+ node, node_t16>;
+
//===----------------------------------------------------------------------===//
// VOP3 DPP
//===----------------------------------------------------------------------===//
-class Base_VOP3_DPP16<bits<10> op, VOP_DPP_Pseudo ps, string opName = ps.OpName>
+class VOP3_DPP16_Helper<bits<10> op, VOP_DPP_Pseudo ps, string opName = ps.OpName>
: VOP3_DPP<op, opName, ps.Pfl, 1> {
let VOP3_OPSEL = ps.Pfl.HasOpSel;
let IsDOT = ps.IsDOT;
@@ -1342,17 +1498,43 @@ class Base_VOP3_DPP16<bits<10> op, VOP_DPP_Pseudo ps, string opName = ps.OpName>
let OtherPredicates = ps.OtherPredicates;
}
+class VOP3_DPP16_t16_Helper<bits<10> op, VOP_DPP_Pseudo ps,
+ string opName = ps.OpName>
+ : VOP3_DPP_t16<op, opName, ps.Pfl, 1> {
+ let VOP3_OPSEL = ps.Pfl.HasOpSel;
+ let IsDOT = ps.IsDOT;
+ let hasSideEffects = ps.hasSideEffects;
+ let Defs = ps.Defs;
+ let SchedRW = ps.SchedRW;
+ let Uses = ps.Uses;
+ let AssemblerPredicate = HasDPP16;
+ let SubtargetPredicate = HasDPP16;
+ let OtherPredicates = ps.OtherPredicates;
+}
+
class VOP3_DPP16<bits<10> op, VOP_DPP_Pseudo ps, int subtarget,
string opName = ps.OpName>
- : Base_VOP3_DPP16<op, ps, opName>, SIMCInstr<ps.PseudoInstr, subtarget>;
+ : VOP3_DPP16_Helper<op, ps, opName>, SIMCInstr<ps.PseudoInstr, subtarget>;
+
+class VOP3_DPP16_t16<bits<10> op, VOP_DPP_Pseudo ps, int subtarget,
+ string opName = ps.OpName>
+ : VOP3_DPP16_t16_Helper<op, ps, opName>, SIMCInstr<ps.PseudoInstr, subtarget>;
class VOP3_DPP16_Gen<bits<10> op, VOP_DPP_Pseudo ps, GFXGen Gen,
- string opName = ps.OpName> :
- VOP3_DPP16 <op, ps, Gen.Subtarget, opName> {
+ string opName = ps.OpName>
+ : VOP3_DPP16<op, ps, Gen.Subtarget, opName> {
let AssemblerPredicate = Gen.AssemblerPredicate;
- let True16Predicate = !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts, NoTrue16Predicate);
- let DecoderNamespace = Gen.DecoderNamespace#
- !if(ps.Pfl.IsRealTrue16, "", "_FAKE16");
+ let DecoderNamespace = Gen.DecoderNamespace;
+}
+
+class VOP3_DPP16_Gen_t16<bits<10> op, VOP_DPP_Pseudo ps, GFXGen Gen,
+ string opName = ps.OpName>
+ : VOP3_DPP16_t16<op, ps, Gen.Subtarget, opName> {
+ let True16Predicate =
+      !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts, NoTrue16Predicate);
+ let AssemblerPredicate = Gen.AssemblerPredicate;
+ let DecoderNamespace =
+      Gen.DecoderNamespace # !if(ps.Pfl.IsRealTrue16, "", "_FAKE16");
}
class Base_VOP3_DPP8<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
@@ -1366,11 +1548,25 @@ class Base_VOP3_DPP8<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
let SubtargetPredicate = ps.SubtargetPredicate;
let OtherPredicates = ps.OtherPredicates;
+ let True16Predicate = ps.True16Predicate;
+}
+
+class Base_VOP3_DPP8_t16<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
+ : VOP3_DPP8_t16<op, opName, ps.Pfl> {
+ let VOP3_OPSEL = ps.Pfl.HasOpSel;
+ let IsDOT = ps.IsDOT;
+ let hasSideEffects = ps.hasSideEffects;
+ let Defs = ps.Defs;
+ let SchedRW = ps.SchedRW;
+ let Uses = ps.Uses;
+
+ let OtherPredicates = ps.OtherPredicates;
+ let True16Predicate = ps.True16Predicate;
}
class Base_VOP3b_DPP16<bits<10> op, VOP_DPP_Pseudo ps,
string opName = ps.OpName>
- : Base_VOP3_DPP16<op, ps, opName> {
+ : VOP3_DPP16_Helper<op, ps, opName> {
bits<7> sdst;
let Inst{14 - 8} = sdst;
}
@@ -1381,6 +1577,12 @@ class VOP3b_DPP8_Base<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
let Inst{14 - 8} = sdst;
}
+class VOP3b_DPP8_Base_t16<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
+ : Base_VOP3_DPP8<op, ps, opName> {
+ bits<8> sdst;
+ let Inst{14 - 8} = sdst{7-1};
+}
+
//===----------------------------------------------------------------------===//
// VOP3 GFX11, GFX12
//===----------------------------------------------------------------------===//
@@ -1420,10 +1622,11 @@ multiclass VOP3Dot_Real_Base<GFXGen Gen, bits<10> op, string opName = NAME,
}
multiclass VOP3_Real_with_name<GFXGen Gen, bits<10> op, string opName,
- string asmName, bit isSingle = 0> {
+ string asmName, string pseudo_mnemonic = "", bit isSingle = 0> {
defvar ps = !cast<VOP_Pseudo>(opName#"_e64");
let AsmString = asmName # ps.AsmOperands,
IsSingle = !or(isSingle, ps.Pfl.IsSingle) in {
+ // FIXME-TRUE16 support FP8 instructions properly
if ps.Pfl.IsFP8SrcByteSel then {
def _e64#Gen.Suffix :
VOP3_Real_Gen<ps, Gen>,
@@ -1432,17 +1635,27 @@ multiclass VOP3_Real_with_name<GFXGen Gen, bits<10> op, string opName,
def _e64#Gen.Suffix :
VOP3_Real_Gen<ps, Gen>,
VOP3FP8OpSel_dst_bytesel_gfx11_gfx12<op, ps.Pfl>;
- } else if ps.Pfl.HasOpSel then {
- def _e64#Gen.Suffix :
- VOP3_Real_Gen<ps, Gen>,
- VOP3OpSel_gfx11_gfx12<op, ps.Pfl>;
} else {
- def _e64#Gen.Suffix :
- VOP3_Real_Gen<ps, Gen>,
- VOP3e_gfx11_gfx12<op, ps.Pfl>;
+ if ps.Pfl.IsRealTrue16 then {
+ def _e64#Gen.Suffix :
+ VOP3_Real_Gen<ps, Gen>,
+ VOP3e_t16_gfx11_gfx12<op, ps.Pfl>;
+ } else {
+ if ps.Pfl.HasOpSel then {
+ def _e64#Gen.Suffix :
+ VOP3_Real_Gen<ps, Gen>,
+ VOP3OpSel_gfx11_gfx12<op, ps.Pfl>;
+ } else {
+ def _e64#Gen.Suffix :
+ VOP3_Real_Gen<ps, Gen>,
+ VOP3e_gfx11_gfx12<op, ps.Pfl>;
+ }
+ }
}
}
- def Gen.Suffix#"_VOP3_alias" : LetDummies, AMDGPUMnemonicAlias<ps.Mnemonic, asmName> {
+ def Gen.Suffix#"_VOP3_alias" : LetDummies,
+ AMDGPUMnemonicAlias<!if(!empty(pseudo_mnemonic),
+ ps.Mnemonic, pseudo_mnemonic), asmName, ""> {
let AssemblerPredicate = Gen.AssemblerPredicate;
}
}
@@ -1456,8 +1669,13 @@ multiclass VOP3_Real_No_Suffix<GFXGen Gen, bits<10> op, string opName = NAME> {
}
multiclass VOP3_Real_dpp_Base<GFXGen Gen, bits<10> op, string opName = NAME> {
- def _e64_dpp#Gen.Suffix :
- VOP3_DPP16_Gen<op, !cast<VOP_DPP_Pseudo>(opName#"_e64"#"_dpp"), Gen>;
+ defvar ps = !cast<VOP_DPP_Pseudo>(opName#"_e64"#"_dpp");
+ if ps.Pfl.IsTrue16 then
+ def _e64_dpp#Gen.Suffix :
+ VOP3_DPP16_Gen_t16<op, ps, Gen>;
+ else
+ def _e64_dpp#Gen.Suffix :
+ VOP3_DPP16_Gen<op, ps, Gen>;
}
multiclass VOP3Dot_Real_dpp_Base<GFXGen Gen, bits<10> op, string opName = NAME> {
@@ -1552,18 +1770,14 @@ multiclass VOP3Only_Realtriple<GFXGen Gen, bits<10> op> :
VOP3_Realtriple<Gen, op, 1>;
multiclass VOP3_Realtriple_with_name<GFXGen Gen, bits<10> op, string opName,
- string asmName, bit isSingle = 0> :
- VOP3_Real_with_name<Gen, op, opName, asmName, isSingle>,
+ string asmName, string pseudo_mnemonic = "", bit isSingle = 0> :
+ VOP3_Real_with_name<Gen, op, opName, asmName, pseudo_mnemonic, isSingle>,
VOP3_Real_dpp_with_name<Gen, op, opName, asmName>,
VOP3_Real_dpp8_with_name<Gen, op, opName, asmName>;
multiclass VOP3Only_Realtriple_with_name<GFXGen Gen, bits<10> op, string opName,
string asmName> :
- VOP3_Realtriple_with_name<Gen, op, opName, asmName, 1>;
-
-multiclass VOP3Only_Realtriple_t16<GFXGen Gen, bits<10> op, string asmName,
- string opName = NAME>
- : VOP3Only_Realtriple_with_name<Gen, op, opName, asmName>;
+ VOP3_Realtriple_with_name<Gen, op, opName, asmName, "", 1>;
multiclass VOP3be_Realtriple<
GFXGen Gen, bits<10> op, bit isSingle = 0, string opName = NAME,
@@ -1579,6 +1793,16 @@ multiclass VOP3beOnly_Realtriple<GFXGen Gen, bits<10> op> :
// VOP3 GFX11
//===----------------------------------------------------------------------===//
+// VOP1 and VOP2 depend on these triple defs
+
+multiclass VOP3_Realtriple_t16_gfx11<bits<10> op, string asmName, string opName = NAME,
+ string pseudo_mnemonic = "", bit isSingle = 0> :
+ VOP3_Realtriple_with_name<GFX11Gen, op, opName, asmName, pseudo_mnemonic, isSingle>;
+
+multiclass VOP3Only_Realtriple_t16_gfx11<bits<10> op, string asmName,
+ string opName = NAME, string pseudo_mnemonic = "">
+ : VOP3_Realtriple_t16_gfx11<op, asmName, opName, pseudo_mnemonic, 1>;
+
multiclass VOP3be_Real_gfx11<bits<10> op, string opName, string asmName,
bit isSingle = 0> :
VOP3be_Real<GFX11Gen, op, opName, asmName, isSingle>;
@@ -1591,10 +1815,6 @@ multiclass VOP3_Realtriple_gfx11<bits<10> op, bit isSingle = 0,
string opName = NAME> :
VOP3_Realtriple<GFX11Gen, op, isSingle, opName>;
-multiclass VOP3Only_Realtriple_t16_gfx11<bits<10> op, string asmName,
- string opName = NAME>
- : VOP3Only_Realtriple_with_name<GFX11Gen, op, opName, asmName>;
-
//===----------------------------------------------------------------------===//
// VOP3 GFX12
//===----------------------------------------------------------------------===//
@@ -1610,6 +1830,16 @@ multiclass VOP3Only_Real_Base_gfx12<bits<10> op> :
multiclass VOP3Only_Realtriple_t16_gfx12<bits<10> op> :
VOP3Only_Realtriple<GFX12Gen, op>;
+multiclass VOP3_Realtriple_t16_gfx12<bits<10> op, string asmName, string opName = NAME,
+ string pseudo_mnemonic = "", bit isSingle = 0> :
+ VOP3_Realtriple_with_name<GFX12Gen, op, opName, asmName, pseudo_mnemonic, isSingle>;
+
+multiclass VOP3_Realtriple_t16_and_fake16_gfx12<bits<10> op, string asmName, string opName = NAME,
+ string pseudo_mnemonic = "", bit isSingle = 0> {
+ defm opName#"_t16":VOP3_Realtriple_t16_gfx12<op, asmName, opName#"_t16", pseudo_mnemonic, isSingle>;
+ defm opName#"_fake16":VOP3_Realtriple_t16_gfx12<op, asmName, opName#"_fake16", pseudo_mnemonic, isSingle>;
+}
+
multiclass VOP3be_Real_with_name_gfx12<bits<10> op, string opName,
string asmName, bit isSingle = 0> {
defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
@@ -1624,18 +1854,14 @@ multiclass VOP3be_Real_with_name_gfx12<bits<10> op, string opName,
}
multiclass VOP3_Realtriple_with_name_gfx12<bits<10> op, string opName,
- string asmName, bit isSingle = 0> :
- VOP3_Realtriple_with_name<GFX12Gen, op, opName, asmName, isSingle>;
+ string asmName, string pseudo_mnemonic = "", bit isSingle = 0> :
+ VOP3_Realtriple_with_name<GFX12Gen, op, opName, asmName, pseudo_mnemonic, isSingle>;
multiclass VOP3Only_Realtriple_with_name_gfx11_gfx12<bits<10> op, string opName,
string asmName> :
VOP3Only_Realtriple_with_name<GFX11Gen, op, opName, asmName>,
VOP3Only_Realtriple_with_name<GFX12Gen, op, opName, asmName>;
-multiclass VOP3Only_Realtriple_with_name_t16_gfx12<bits<10> op, string asmName,
- string opName = NAME>
- : VOP3Only_Realtriple_with_name<GFX12Gen, op, opName, asmName>;
-
//===----------------------------------------------------------------------===//
include "VOPCInstructions.td"
@@ -1705,4 +1931,4 @@ def VOPTrue16Table : GenericTable {
let PrimaryKey = ["Opcode"];
let PrimaryKeyName = "getTrue16OpcodeHelper";
-} \ No newline at end of file
+}
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index a35582b..a49dda87 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21445,9 +21445,7 @@ void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
IRBuilderBase &Builder) const {
if (!Subtarget->hasV7Ops())
return;
- Module *M = Builder.GetInsertBlock()->getParent()->getParent();
- Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_clrex));
+ Builder.CreateIntrinsic(Intrinsic::arm_clrex, {}, {});
}
Value *ARMTargetLowering::emitStoreConditional(IRBuilderBase &Builder,
diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 75fb904..b908e4f3 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -2532,14 +2532,14 @@ public:
void addCondCodeOperands(MCInst &Inst, unsigned N) const {
assert(N == 2 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createImm(unsigned(getCondCode())));
- unsigned RegNum = getCondCode() == ARMCC::AL ? 0: ARM::CPSR;
+ unsigned RegNum = getCondCode() == ARMCC::AL ? ARM::NoRegister : ARM::CPSR;
Inst.addOperand(MCOperand::createReg(RegNum));
}
void addVPTPredNOperands(MCInst &Inst, unsigned N) const {
assert(N == 3 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createImm(unsigned(getVPTPred())));
- unsigned RegNum = getVPTPred() == ARMVCC::None ? 0: ARM::P0;
+ unsigned RegNum = getVPTPred() == ARMVCC::None ? ARM::NoRegister : ARM::P0;
Inst.addOperand(MCOperand::createReg(RegNum));
Inst.addOperand(MCOperand::createReg(0));
}
@@ -7164,8 +7164,8 @@ bool ARMAsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Name,
// Add the carry setting operand, if necessary.
if (CanAcceptCarrySet && CarrySetting) {
SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Mnemonic.size());
- Operands.push_back(
- ARMOperand::CreateCCOut(CarrySetting ? ARM::CPSR : 0, Loc, *this));
+ Operands.push_back(ARMOperand::CreateCCOut(
+ CarrySetting ? ARM::CPSR : ARM::NoRegister, Loc, *this));
}
// Add the predication code operand, if necessary.
@@ -10372,7 +10372,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
case ARM::t2ASRri:
if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
isARMLowRegister(Inst.getOperand(1).getReg()) &&
- Inst.getOperand(5).getReg() == (inITBlock() ? 0 : ARM::CPSR) &&
+ Inst.getOperand(5).getReg() ==
+ (inITBlock() ? ARM::NoRegister : ARM::CPSR) &&
!HasWideQualifier) {
unsigned NewOpc;
switch (Inst.getOpcode()) {
@@ -10422,14 +10423,14 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
TmpInst.addOperand(Inst.getOperand(0)); // Rd
if (isNarrow)
TmpInst.addOperand(MCOperand::createReg(
- Inst.getOpcode() == ARM::t2MOVSsr ? ARM::CPSR : 0));
+ Inst.getOpcode() == ARM::t2MOVSsr ? ARM::CPSR : ARM::NoRegister));
TmpInst.addOperand(Inst.getOperand(1)); // Rn
TmpInst.addOperand(Inst.getOperand(2)); // Rm
TmpInst.addOperand(Inst.getOperand(4)); // CondCode
TmpInst.addOperand(Inst.getOperand(5));
if (!isNarrow)
TmpInst.addOperand(MCOperand::createReg(
- Inst.getOpcode() == ARM::t2MOVSsr ? ARM::CPSR : 0));
+ Inst.getOpcode() == ARM::t2MOVSsr ? ARM::CPSR : ARM::NoRegister));
Inst = TmpInst;
return true;
}
@@ -10475,7 +10476,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
TmpInst.addOperand(Inst.getOperand(0)); // Rd
if (isNarrow && !isMov)
TmpInst.addOperand(MCOperand::createReg(
- Inst.getOpcode() == ARM::t2MOVSsi ? ARM::CPSR : 0));
+ Inst.getOpcode() == ARM::t2MOVSsi ? ARM::CPSR : ARM::NoRegister));
TmpInst.addOperand(Inst.getOperand(1)); // Rn
if (newOpc != ARM::t2RRX && !isMov)
TmpInst.addOperand(MCOperand::createImm(Amount));
@@ -10483,7 +10484,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
TmpInst.addOperand(Inst.getOperand(4));
if (!isNarrow)
TmpInst.addOperand(MCOperand::createReg(
- Inst.getOpcode() == ARM::t2MOVSsi ? ARM::CPSR : 0));
+ Inst.getOpcode() == ARM::t2MOVSsi ? ARM::CPSR : ARM::NoRegister));
Inst = TmpInst;
return true;
}
@@ -10684,7 +10685,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
!isARMLowRegister(Inst.getOperand(0).getReg()) ||
(Inst.getOperand(2).isImm() &&
(unsigned)Inst.getOperand(2).getImm() > 255) ||
- Inst.getOperand(5).getReg() != (inITBlock() ? 0 : ARM::CPSR) ||
+ Inst.getOperand(5).getReg() !=
+ (inITBlock() ? ARM::NoRegister : ARM::CPSR) ||
HasWideQualifier)
break;
MCInst TmpInst;
@@ -10852,7 +10854,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
(Inst.getOperand(1).isImm() &&
(unsigned)Inst.getOperand(1).getImm() <= 255) &&
- Inst.getOperand(4).getReg() == (inITBlock() ? 0 : ARM::CPSR) &&
+ Inst.getOperand(4).getReg() ==
+ (inITBlock() ? ARM::NoRegister : ARM::CPSR) &&
!HasWideQualifier) {
// The operands aren't in the same order for tMOVi8...
MCInst TmpInst;
@@ -10993,7 +10996,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
if ((isARMLowRegister(Inst.getOperand(1).getReg()) &&
isARMLowRegister(Inst.getOperand(2).getReg())) &&
Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() &&
- Inst.getOperand(5).getReg() == (inITBlock() ? 0 : ARM::CPSR) &&
+ Inst.getOperand(5).getReg() ==
+ (inITBlock() ? ARM::NoRegister : ARM::CPSR) &&
!HasWideQualifier) {
unsigned NewOpc;
switch (Inst.getOpcode()) {
@@ -11029,7 +11033,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
isARMLowRegister(Inst.getOperand(2).getReg())) &&
(Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() ||
Inst.getOperand(0).getReg() == Inst.getOperand(2).getReg()) &&
- Inst.getOperand(5).getReg() == (inITBlock() ? 0 : ARM::CPSR) &&
+ Inst.getOperand(5).getReg() ==
+ (inITBlock() ? ARM::NoRegister : ARM::CPSR) &&
!HasWideQualifier) {
unsigned NewOpc;
switch (Inst.getOpcode()) {
diff --git a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index 93b7490..fa5dd10 100644
--- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -894,12 +894,13 @@ void ARMDisassembler::AddThumb1SBit(MCInst &MI, bool InITBlock) const {
MCID.operands()[i].RegClass == ARM::CCRRegClassID) {
if (i > 0 && MCID.operands()[i - 1].isPredicate())
continue;
- MI.insert(I, MCOperand::createReg(InITBlock ? 0 : ARM::CPSR));
+ MI.insert(I,
+ MCOperand::createReg(InITBlock ? ARM::NoRegister : ARM::CPSR));
return;
}
}
- MI.insert(I, MCOperand::createReg(InITBlock ? 0 : ARM::CPSR));
+ MI.insert(I, MCOperand::createReg(InITBlock ? ARM::NoRegister : ARM::CPSR));
}
bool ARMDisassembler::isVectorPredicable(const MCInst &MI) const {
diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
index be714b5c..8ea3140 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
@@ -18,6 +18,8 @@ using namespace llvm;
bool DirectXTTIImpl::isTargetIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
unsigned ScalarOpdIdx) {
switch (ID) {
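+  // The lane index (operand 1) of wave_readlane stays scalar when the
+  // intrinsic is scalarized.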
+ case Intrinsic::dx_wave_readlane:
+ return ScalarOpdIdx == 1;
default:
return false;
}
@@ -28,6 +30,7 @@ bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable(
switch (ID) {
case Intrinsic::dx_frac:
case Intrinsic::dx_rsqrt:
+ case Intrinsic::dx_wave_readlane:
return true;
default:
return false;
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index 3041c16..bb76cfd 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -435,6 +435,9 @@ static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS,
continue;
}
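+    // Memory transfers of the argument are not loads through the pointer and
+    // need no alignment fixup here, so skip them.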
+ if (isa<MemTransferInst>(CurUser))
+ continue;
+
// supported for grid_constant
if (IsGridConstant &&
(isa<CallInst>(CurUser) || isa<StoreInst>(CurUser) ||
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 911d92f..cec1e50 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -12205,11 +12205,8 @@ Instruction *PPCTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
// http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
// and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification.
if (isa<LoadInst>(Inst))
- return Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(
- Builder.GetInsertBlock()->getParent()->getParent(),
- Intrinsic::ppc_cfence, {Inst->getType()}),
- {Inst});
+ return Builder.CreateIntrinsic(Intrinsic::ppc_cfence, {Inst->getType()},
+ {Inst});
// FIXME: Can use isync for rmw operation.
return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
}
diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
index dd07892..fe9ab22 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -3606,6 +3606,10 @@ def : Pat<(i64 (lround f64:$S)),
(i64 (MFVSRD (FCTID (XSRDPI $S))))>;
def : Pat<(i64 (lround f32:$S)),
(i64 (MFVSRD (FCTID (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)))))>;
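+// i32 lround: XSRDPI rounds to the nearest integral value (ties away from
+// zero), FCTIW converts it to a signed word, and MFVSRWZ moves it to a GPR.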
+def : Pat<(i32 (lround f64:$S)),
+ (i32 (MFVSRWZ (FCTIW (XSRDPI $S))))>;
+def : Pat<(i32 (lround f32:$S)),
+ (i32 (MFVSRWZ (FCTIW (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)))))>;
def : Pat<(i64 (llround f64:$S)),
(i64 (MFVSRD (FCTID (XSRDPI $S))))>;
def : Pat<(i64 (llround f32:$S)),
diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
index 7d04559..cd18830 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -521,7 +521,7 @@ bool PPCPassConfig::addPreISel() {
}
bool PPCPassConfig::addILPOpts() {
- addPass(&EarlyIfConverterID);
+ addPass(&EarlyIfConverterLegacyID);
if (EnableMachineCombinerPass)
addPass(&MachineCombinerID);
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index d77ad02..0bc3584 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -3693,6 +3693,9 @@ bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
switch (Inst.getOpcode()) {
default:
break;
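+  // "c.addi x0, 0" is an alternate spelling of c.nop; lower it to the
+  // canonical C_NOP.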
+ case RISCV::PseudoC_ADDI_NOP:
+ emitToStreamer(Out, MCInstBuilder(RISCV::C_NOP));
+ return false;
case RISCV::PseudoLLAImm:
case RISCV::PseudoLAImm:
case RISCV::PseudoLI: {
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index bf333b7b..14249e3 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -717,6 +717,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
ISD::VECREDUCE_FADD, ISD::VECREDUCE_SEQ_FADD, ISD::VECREDUCE_FMIN,
ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMINIMUM, ISD::VECREDUCE_FMAXIMUM};
+ static const unsigned FloatingPointLibCallOps[] = {
+ ISD::FREM, ISD::FPOW, ISD::FCOS, ISD::FSIN, ISD::FSINCOS, ISD::FEXP,
+ ISD::FEXP2, ISD::FEXP10, ISD::FLOG, ISD::FLOG2, ISD::FLOG10};
+
if (!Subtarget.is64Bit()) {
// We must custom-lower certain vXi64 operations on RV32 due to the vector
// element type being illegal.
@@ -1002,17 +1006,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(FloatingPointVecReduceOps, VT, Custom);
// Expand FP operations that need libcalls.
- setOperationAction(ISD::FREM, VT, Expand);
- setOperationAction(ISD::FPOW, VT, Expand);
- setOperationAction(ISD::FCOS, VT, Expand);
- setOperationAction(ISD::FSIN, VT, Expand);
- setOperationAction(ISD::FSINCOS, VT, Expand);
- setOperationAction(ISD::FEXP, VT, Expand);
- setOperationAction(ISD::FEXP2, VT, Expand);
- setOperationAction(ISD::FEXP10, VT, Expand);
- setOperationAction(ISD::FLOG, VT, Expand);
- setOperationAction(ISD::FLOG2, VT, Expand);
- setOperationAction(ISD::FLOG10, VT, Expand);
+ setOperationAction(FloatingPointLibCallOps, VT, Expand);
setOperationAction(ISD::FCOPYSIGN, VT, Legal);
@@ -1076,7 +1070,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction({ISD::INSERT_VECTOR_ELT, ISD::CONCAT_VECTORS,
ISD::INSERT_SUBVECTOR, ISD::EXTRACT_SUBVECTOR,
ISD::VECTOR_DEINTERLEAVE, ISD::VECTOR_INTERLEAVE,
- ISD::VECTOR_REVERSE},
+ ISD::VECTOR_REVERSE, ISD::VECTOR_SPLICE},
VT, Custom);
MVT EltVT = VT.getVectorElementType();
if (isTypeLegal(EltVT))
@@ -1097,6 +1091,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FABS, VT, Expand);
setOperationAction(ISD::FCOPYSIGN, VT, Expand);
+ // Expand FP operations that need libcalls.
+ setOperationAction(FloatingPointLibCallOps, VT, Expand);
+
// Custom split nxv32[b]f16 since nxv32[b]f32 is not legal.
if (getLMUL(VT) == RISCVII::VLMUL::LMUL_8) {
setOperationAction(ZvfhminZvfbfminPromoteOps, VT, Custom);
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index b8539a5..3989a96 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -4102,3 +4102,17 @@ unsigned RISCV::getDestLog2EEW(const MCInstrDesc &Desc, unsigned Log2SEW) {
assert(Scaled >= 3 && Scaled <= 6);
return Scaled;
}
+
+/// Given two VL operands, do we know that LHS <= RHS?
+bool RISCV::isVLKnownLE(const MachineOperand &LHS, const MachineOperand &RHS) {
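+  // The same virtual register on both sides trivially gives LHS <= RHS.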
+ if (LHS.isReg() && RHS.isReg() && LHS.getReg().isVirtual() &&
+ LHS.getReg() == RHS.getReg())
+ return true;
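+  // VLMAX is the largest possible VL, so any LHS is known to be <= it.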
+ if (RHS.isImm() && RHS.getImm() == RISCV::VLMaxSentinel)
+ return true;
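+  // Conversely, a VLMAX LHS is only <= a VLMAX RHS, handled above.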
+ if (LHS.isImm() && LHS.getImm() == RISCV::VLMaxSentinel)
+ return false;
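+  // Two distinct registers cannot be ordered; otherwise compare immediates.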
+ if (!LHS.isImm() || !RHS.isImm())
+ return false;
+ return LHS.getImm() <= RHS.getImm();
+}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 457db9b..c3aa367 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -346,6 +346,9 @@ unsigned getDestLog2EEW(const MCInstrDesc &Desc, unsigned Log2SEW);
// Special immediate for AVL operand of V pseudo instructions to indicate VLMax.
static constexpr int64_t VLMaxSentinel = -1LL;
+/// Given two VL operands, do we know that LHS <= RHS?
+bool isVLKnownLE(const MachineOperand &LHS, const MachineOperand &RHS);
+
// Mask assignments for floating-point
static constexpr unsigned FPMASK_Negative_Infinity = 0x001;
static constexpr unsigned FPMASK_Negative_Normal = 0x002;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
index e8c4860..8a76dba 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
@@ -418,15 +418,11 @@ def C_ADDI : RVInst16CI<0b000, 0b01, (outs GPRNoX0:$rd_wb),
let Inst{6-2} = imm{4-0};
}
-let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-def C_ADDI_NOP : RVInst16CI<0b000, 0b01, (outs GPRX0:$rd_wb),
- (ins GPRX0:$rd, immzero:$imm),
- "c.addi", "$rd, $imm">,
- Sched<[WriteIALU, ReadIALU]> {
- let Constraints = "$rd = $rd_wb";
- let Inst{6-2} = 0;
- let isAsmParserOnly = 1;
-}
+// Alternate syntax for c.nop. Converted to C_NOP by the assembler.
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCodeGenOnly = 0,
+ isAsmParserOnly = 1 in
+def PseudoC_ADDI_NOP : Pseudo<(outs GPRX0:$rd), (ins GPRX0:$rs1, immzero:$imm),
+ [], "c.addi", "$rd, $imm">;
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCall = 1,
DecoderNamespace = "RISCV32Only_", Defs = [X1],
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index cba73ab..df5c6b5 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1130,6 +1130,10 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
return getCmpSelInstrCost(*FOp, ICA.getReturnType(), ICA.getArgTypes()[0],
CmpInst::BAD_ICMP_PREDICATE, CostKind);
}
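+  // vp.merge is costed like a select between its two source operands.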
+ case Intrinsic::vp_merge:
+ return getCmpSelInstrCost(Instruction::Select, ICA.getReturnType(),
+ ICA.getArgTypes()[0], CmpInst::BAD_ICMP_PREDICATE,
+ CostKind);
}
if (ST->hasVInstructions() && RetTy->isVectorTy()) {
@@ -2429,4 +2433,4 @@ bool RISCVTTIImpl::isProfitableToSinkOperands(
Ops.push_back(&OpIdx.value());
}
return true;
-} \ No newline at end of file
+}
diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
index 53373b7..ee494c4 100644
--- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
@@ -51,7 +51,7 @@ public:
StringRef getPassName() const override { return PASS_NAME; }
private:
- bool checkUsers(std::optional<Register> &CommonVL, MachineInstr &MI);
+ bool checkUsers(const MachineOperand *&CommonVL, MachineInstr &MI);
bool tryReduceVL(MachineInstr &MI);
bool isCandidate(const MachineInstr &MI) const;
};
@@ -563,7 +563,12 @@ static bool isSupportedInstr(const MachineInstr &MI) {
case RISCV::VREM_VV:
case RISCV::VREM_VX:
// Vector Widening Integer Multiply Instructions
- // FIXME: Add support
+ case RISCV::VWMUL_VV:
+ case RISCV::VWMUL_VX:
+ case RISCV::VWMULSU_VV:
+ case RISCV::VWMULSU_VX:
+ case RISCV::VWMULU_VV:
+ case RISCV::VWMULU_VX:
// Vector Single-Width Integer Multiply-Add Instructions
// FIXME: Add support
// Vector Widening Integer Multiply-Add Instructions
@@ -653,10 +658,34 @@ bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const {
if (MI.getNumDefs() != 1)
return false;
+ // If we're not using VLMAX, then we need to be careful whether we are using
+ // TA/TU when there is a non-undef Passthru. But when we are using VLMAX, it
+ // does not matter whether we are using TA/TU with a non-undef Passthru, since
+  // there are no tail elements to be preserved.
unsigned VLOpNum = RISCVII::getVLOpNum(Desc);
const MachineOperand &VLOp = MI.getOperand(VLOpNum);
- if (!VLOp.isImm() || VLOp.getImm() != RISCV::VLMaxSentinel)
+ if (VLOp.isReg() || VLOp.getImm() != RISCV::VLMaxSentinel) {
+ // If MI has a non-undef passthru, we will not try to optimize it since
+ // that requires us to preserve tail elements according to TA/TU.
+ // Otherwise, The MI has an undef Passthru, so it doesn't matter whether we
+ // are using TA/TU.
+ bool HasPassthru = RISCVII::isFirstDefTiedToFirstUse(Desc);
+ unsigned PassthruOpIdx = MI.getNumExplicitDefs();
+ if (HasPassthru &&
+ MI.getOperand(PassthruOpIdx).getReg() != RISCV::NoRegister) {
+ LLVM_DEBUG(
+          dbgs() << "    Not a candidate because it uses a non-undef passthru"
+ " with non-VLMAX VL\n");
+ return false;
+ }
+ }
+
+ // If the VL is 1, then there is no need to reduce it. This is an
+ // optimization, not needed to preserve correctness.
+ if (VLOp.isImm() && VLOp.getImm() == 1) {
+ LLVM_DEBUG(dbgs() << " Not a candidate because VL is already 1\n");
return false;
+ }
// Some instructions that produce vectors have semantics that make it more
// difficult to determine whether the VL can be reduced. For example, some
@@ -679,7 +708,7 @@ bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const {
return true;
}
-bool RISCVVLOptimizer::checkUsers(std::optional<Register> &CommonVL,
+bool RISCVVLOptimizer::checkUsers(const MachineOperand *&CommonVL,
MachineInstr &MI) {
// FIXME: Avoid visiting each user for each time we visit something on the
// worklist, combined with an extra visit from the outer loop. Restructure
@@ -725,16 +754,17 @@ bool RISCVVLOptimizer::checkUsers(std::optional<Register> &CommonVL,
unsigned VLOpNum = RISCVII::getVLOpNum(Desc);
const MachineOperand &VLOp = UserMI.getOperand(VLOpNum);
- // Looking for a register VL that isn't X0.
- if (!VLOp.isReg() || VLOp.getReg() == RISCV::X0) {
- LLVM_DEBUG(dbgs() << " Abort due to user uses X0 as VL.\n");
- CanReduceVL = false;
- break;
- }
+
+ // Looking for an immediate or a register VL that isn't X0.
+      assert((!VLOp.isReg() || VLOp.getReg() != RISCV::X0) &&
+             "Did not expect X0 VL");
if (!CommonVL) {
- CommonVL = VLOp.getReg();
- } else if (*CommonVL != VLOp.getReg()) {
+ CommonVL = &VLOp;
+ LLVM_DEBUG(dbgs() << " User VL is: " << VLOp << "\n");
+ } else if (!CommonVL->isIdenticalTo(VLOp)) {
+ // FIXME: This check requires all users to have the same VL. We can relax
+ // this and get the largest VL amongst all users.
LLVM_DEBUG(dbgs() << " Abort because users have different VL\n");
CanReduceVL = false;
break;
@@ -771,7 +801,7 @@ bool RISCVVLOptimizer::tryReduceVL(MachineInstr &OrigMI) {
MachineInstr &MI = *Worklist.pop_back_val();
LLVM_DEBUG(dbgs() << "Trying to reduce VL for " << MI << "\n");
- std::optional<Register> CommonVL;
+ const MachineOperand *CommonVL = nullptr;
bool CanReduceVL = true;
if (isVectorRegClass(MI.getOperand(0).getReg(), MRI))
CanReduceVL = checkUsers(CommonVL, MI);
@@ -779,21 +809,34 @@ bool RISCVVLOptimizer::tryReduceVL(MachineInstr &OrigMI) {
if (!CanReduceVL || !CommonVL)
continue;
- if (!CommonVL->isVirtual()) {
- LLVM_DEBUG(
- dbgs() << " Abort due to new VL is not virtual register.\n");
+ assert((CommonVL->isImm() || CommonVL->getReg().isVirtual()) &&
+ "Expected VL to be an Imm or virtual Reg");
+
+ unsigned VLOpNum = RISCVII::getVLOpNum(MI.getDesc());
+ MachineOperand &VLOp = MI.getOperand(VLOpNum);
+
+ if (!RISCV::isVLKnownLE(*CommonVL, VLOp)) {
+ LLVM_DEBUG(dbgs() << " Abort due to CommonVL not <= VLOp.\n");
continue;
}
- const MachineInstr *VLMI = MRI->getVRegDef(*CommonVL);
- if (!MDT->dominates(VLMI, &MI))
- continue;
+ if (CommonVL->isImm()) {
+ LLVM_DEBUG(dbgs() << " Reduce VL from " << VLOp << " to "
+ << CommonVL->getImm() << " for " << MI << "\n");
+ VLOp.ChangeToImmediate(CommonVL->getImm());
+ } else {
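+      // A register VL may only be used here if its def dominates MI.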
+ const MachineInstr *VLMI = MRI->getVRegDef(CommonVL->getReg());
+ if (!MDT->dominates(VLMI, &MI))
+ continue;
+ LLVM_DEBUG(
+ dbgs() << " Reduce VL from " << VLOp << " to "
+ << printReg(CommonVL->getReg(), MRI->getTargetRegisterInfo())
+ << " for " << MI << "\n");
+
+ // All our checks passed. We can reduce VL.
+ VLOp.ChangeToRegister(CommonVL->getReg(), false);
+ }
- // All our checks passed. We can reduce VL.
- LLVM_DEBUG(dbgs() << " Reducing VL for: " << MI << "\n");
- unsigned VLOpNum = RISCVII::getVLOpNum(MI.getDesc());
- MachineOperand &VLOp = MI.getOperand(VLOpNum);
- VLOp.ChangeToRegister(*CommonVL, false);
MadeChange = true;
// Now add all inputs to this instruction to the worklist.
diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
index b883c50..a57bc5a 100644
--- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
@@ -86,20 +86,6 @@ char RISCVVectorPeephole::ID = 0;
INITIALIZE_PASS(RISCVVectorPeephole, DEBUG_TYPE, "RISC-V Fold Masks", false,
false)
-/// Given two VL operands, do we know that LHS <= RHS?
-static bool isVLKnownLE(const MachineOperand &LHS, const MachineOperand &RHS) {
- if (LHS.isReg() && RHS.isReg() && LHS.getReg().isVirtual() &&
- LHS.getReg() == RHS.getReg())
- return true;
- if (RHS.isImm() && RHS.getImm() == RISCV::VLMaxSentinel)
- return true;
- if (LHS.isImm() && LHS.getImm() == RISCV::VLMaxSentinel)
- return false;
- if (!LHS.isImm() || !RHS.isImm())
- return false;
- return LHS.getImm() <= RHS.getImm();
-}
-
/// Given \p User that has an input operand with EEW=SEW, which uses the dest
/// operand of \p Src with an unknown EEW, return true if their EEWs match.
bool RISCVVectorPeephole::hasSameEEW(const MachineInstr &User,
@@ -191,7 +177,7 @@ bool RISCVVectorPeephole::tryToReduceVL(MachineInstr &MI) const {
return false;
MachineOperand &SrcVL = Src->getOperand(RISCVII::getVLOpNum(Src->getDesc()));
- if (VL.isIdenticalTo(SrcVL) || !isVLKnownLE(VL, SrcVL))
+ if (VL.isIdenticalTo(SrcVL) || !RISCV::isVLKnownLE(VL, SrcVL))
return false;
if (!ensureDominates(VL, *Src))
@@ -580,7 +566,7 @@ bool RISCVVectorPeephole::foldUndefPassthruVMV_V_V(MachineInstr &MI) {
MachineOperand &SrcPolicy =
Src->getOperand(RISCVII::getVecPolicyOpNum(Src->getDesc()));
- if (isVLKnownLE(MIVL, SrcVL))
+ if (RISCV::isVLKnownLE(MIVL, SrcVL))
SrcPolicy.setImm(SrcPolicy.getImm() | RISCVII::TAIL_AGNOSTIC);
}
@@ -631,7 +617,7 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) {
// so we don't need to handle a smaller source VL here. However, the
// user's VL may be larger
MachineOperand &SrcVL = Src->getOperand(RISCVII::getVLOpNum(Src->getDesc()));
- if (!isVLKnownLE(SrcVL, MI.getOperand(3)))
+ if (!RISCV::isVLKnownLE(SrcVL, MI.getOperand(3)))
return false;
// If the new passthru doesn't dominate Src, try to move Src so it does.
@@ -650,7 +636,7 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) {
// If MI was tail agnostic and the VL didn't increase, preserve it.
int64_t Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED;
if ((MI.getOperand(5).getImm() & RISCVII::TAIL_AGNOSTIC) &&
- isVLKnownLE(MI.getOperand(3), SrcVL))
+ RISCV::isVLKnownLE(MI.getOperand(3), SrcVL))
Policy |= RISCVII::TAIL_AGNOSTIC;
Src->getOperand(RISCVII::getVecPolicyOpNum(Src->getDesc())).setImm(Policy);
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index c762643..83417e5 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -9863,7 +9863,7 @@ verifyNarrowIntegerArgs_Call(const SmallVectorImpl<ISD::OutputArg> &Outs,
if (CalleeFn != nullptr)
printFunctionArgExts(CalleeFn, errs());
else
- errs() << "-";
+ errs() << "-\n";
errs() << "Caller: ";
printFunctionArgExts(F, errs());
llvm_unreachable("");
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
index 53ed46f..f76f417 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -257,7 +257,7 @@ bool SystemZPassConfig::addInstSelector() {
}
bool SystemZPassConfig::addILPOpts() {
- addPass(&EarlyIfConverterID);
+ addPass(&EarlyIfConverterLegacyID);
if (EnableMachineCombinerPass)
addPass(&MachineCombinerID);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5b4b27c..0155409 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -858,6 +858,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FASIN , MVT::f80, Expand);
setOperationAction(ISD::FACOS , MVT::f80, Expand);
setOperationAction(ISD::FATAN , MVT::f80, Expand);
+ setOperationAction(ISD::FATAN2 , MVT::f80, Expand);
setOperationAction(ISD::FSINH , MVT::f80, Expand);
setOperationAction(ISD::FCOSH , MVT::f80, Expand);
setOperationAction(ISD::FTANH , MVT::f80, Expand);
@@ -2562,6 +2563,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
{ISD::FACOS, ISD::STRICT_FACOS,
ISD::FASIN, ISD::STRICT_FASIN,
ISD::FATAN, ISD::STRICT_FATAN,
+ ISD::FATAN2, ISD::STRICT_FATAN2,
ISD::FCEIL, ISD::STRICT_FCEIL,
ISD::FCOS, ISD::STRICT_FCOS,
ISD::FCOSH, ISD::STRICT_FCOSH,
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index 8561658..12cd92e 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -2856,6 +2856,13 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
return false;
}
+  // The caller's stack frame cannot be replaced by the tail callee's if the
+  // caller is required to preserve all registers. Conservatively prevent tail
+  // call optimization even if, hypothetically, all registers were used for
+  // passing formal parameters or returning values.
+ if (CallerF.hasFnAttribute("no_caller_saved_registers"))
+ return false;
+
unsigned StackArgsSize = CCInfo.getStackSize();
// If the callee takes no arguments then go on to check the results of the
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index ceb87a6..4ba0ac1 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -536,7 +536,7 @@ bool X86PassConfig::addGlobalInstructionSelect() {
}
bool X86PassConfig::addILPOpts() {
- addPass(&EarlyIfConverterID);
+ addPass(&EarlyIfConverterLegacyID);
if (EnableMachineCombinerPass)
addPass(&MachineCombinerID);
addPass(createX86CmovConverterPass());
diff --git a/llvm/lib/Target/X86/X86WinEHState.cpp b/llvm/lib/Target/X86/X86WinEHState.cpp
index 05fc6f1..bc9fd80 100644
--- a/llvm/lib/Target/X86/X86WinEHState.cpp
+++ b/llvm/lib/Target/X86/X86WinEHState.cpp
@@ -333,12 +333,10 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) {
// If using _except_handler4, the EHGuard contains: FramePtr xor Cookie.
if (UseStackGuard) {
Value *Val = Builder.CreateLoad(Int32Ty, Cookie);
- Value *FrameAddr = Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(
- TheModule, Intrinsic::frameaddress,
- Builder.getPtrTy(
- TheModule->getDataLayout().getAllocaAddrSpace())),
- Builder.getInt32(0), "frameaddr");
+ Value *FrameAddr = Builder.CreateIntrinsic(
+ Intrinsic::frameaddress,
+ Builder.getPtrTy(TheModule->getDataLayout().getAllocaAddrSpace()),
+ Builder.getInt32(0), /*FMFSource=*/nullptr, "frameaddr");
Value *FrameAddrI32 = Builder.CreatePtrToInt(FrameAddr, Int32Ty);
FrameAddrI32 = Builder.CreateXor(FrameAddrI32, Val);
Builder.CreateStore(FrameAddrI32, EHGuardNode);
@@ -369,8 +367,7 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) {
}
Value *WinEHStatePass::emitEHLSDA(IRBuilder<> &Builder, Function *F) {
- return Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(TheModule, Intrinsic::x86_seh_lsda), F);
+ return Builder.CreateIntrinsic(Intrinsic::x86_seh_lsda, {}, F);
}
/// Generate a thunk that puts the LSDA of ParentFunc in EAX and then calls
@@ -624,17 +621,13 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) {
// that it can recover the original frame pointer.
IRBuilder<> Builder(RegNode->getNextNode());
Value *RegNodeI8 = Builder.CreateBitCast(RegNode, Builder.getPtrTy());
- Builder.CreateCall(Intrinsic::getOrInsertDeclaration(
- TheModule, Intrinsic::x86_seh_ehregnode),
- {RegNodeI8});
+ Builder.CreateIntrinsic(Intrinsic::x86_seh_ehregnode, {}, {RegNodeI8});
if (EHGuardNode) {
IRBuilder<> Builder(EHGuardNode->getNextNode());
Value *EHGuardNodeI8 =
Builder.CreateBitCast(EHGuardNode, Builder.getPtrTy());
- Builder.CreateCall(Intrinsic::getOrInsertDeclaration(
- TheModule, Intrinsic::x86_seh_ehguard),
- {EHGuardNodeI8});
+ Builder.CreateIntrinsic(Intrinsic::x86_seh_ehguard, {}, {EHGuardNodeI8});
}
// Calculate state numbers.
diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
index a7a01ca..3121659 100644
--- a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
+++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
@@ -145,9 +145,10 @@ public:
// function here in the meantime to decouple from that discussion.
Function *getPreexistingDeclaration(Module *M, Intrinsic::ID Id,
ArrayRef<Type *> Tys = {}) {
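+  // A non-overloaded intrinsic can be looked up by ID alone; an overloaded
+  // one needs its concrete signature type.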
+ if (Tys.empty())
+ return Intrinsic::getDeclarationIfExists(M, Id);
auto *FT = Intrinsic::getType(M->getContext(), Id, Tys);
- return M->getFunction(Tys.empty() ? Intrinsic::getName(Id)
- : Intrinsic::getName(Id, Tys, M, FT));
+ return Intrinsic::getDeclarationIfExists(M, Id, Tys, FT);
}
class ExpandVariadics : public ModulePass {
diff --git a/llvm/lib/Transforms/IPO/GlobalDCE.cpp b/llvm/lib/Transforms/IPO/GlobalDCE.cpp
index e36d524..eca36fb 100644
--- a/llvm/lib/Transforms/IPO/GlobalDCE.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalDCE.cpp
@@ -186,9 +186,9 @@ void GlobalDCEPass::ScanVTableLoad(Function *Caller, Metadata *TypeId,
void GlobalDCEPass::ScanTypeCheckedLoadIntrinsics(Module &M) {
LLVM_DEBUG(dbgs() << "Scanning type.checked.load intrinsics\n");
Function *TypeCheckedLoadFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load));
- Function *TypeCheckedLoadRelativeFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load_relative));
+ Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_checked_load);
+ Function *TypeCheckedLoadRelativeFunc = Intrinsic::getDeclarationIfExists(
+ &M, Intrinsic::type_checked_load_relative);
auto scan = [&](Function *CheckedLoadFunc) {
if (!CheckedLoadFunc)
diff --git a/llvm/lib/Transforms/IPO/GlobalSplit.cpp b/llvm/lib/Transforms/IPO/GlobalSplit.cpp
index fd49b74..320fd89 100644
--- a/llvm/lib/Transforms/IPO/GlobalSplit.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalSplit.cpp
@@ -174,11 +174,11 @@ static bool splitGlobals(Module &M) {
// llvm.type.checked.load intrinsics, which indicates that splitting globals
// may be beneficial.
Function *TypeTestFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::type_test));
+ Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_test);
Function *TypeCheckedLoadFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load));
- Function *TypeCheckedLoadRelativeFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load_relative));
+ Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_checked_load);
+ Function *TypeCheckedLoadRelativeFunc = Intrinsic::getDeclarationIfExists(
+ &M, Intrinsic::type_checked_load_relative);
if ((!TypeTestFunc || TypeTestFunc->use_empty()) &&
(!TypeCheckedLoadFunc || TypeCheckedLoadFunc->use_empty()) &&
(!TypeCheckedLoadRelativeFunc ||
diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
index 519a4e9..3fcfc6a 100644
--- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -1970,7 +1970,7 @@ static void dropTypeTests(Module &M, Function &TypeTestFunc) {
bool LowerTypeTestsModule::lower() {
Function *TypeTestFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::type_test));
+ Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_test);
if (DropTypeTests) {
if (TypeTestFunc)
@@ -1979,7 +1979,7 @@ bool LowerTypeTestsModule::lower() {
// except for in the case where we originally were performing ThinLTO but
// decided not to in the backend.
Function *PublicTypeTestFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::public_type_test));
+ Intrinsic::getDeclarationIfExists(&M, Intrinsic::public_type_test);
if (PublicTypeTestFunc)
dropTypeTests(M, *PublicTypeTestFunc);
if (TypeTestFunc || PublicTypeTestFunc) {
@@ -2002,7 +2002,7 @@ bool LowerTypeTestsModule::lower() {
return false;
Function *ICallBranchFunnelFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::icall_branch_funnel));
+ Intrinsic::getDeclarationIfExists(&M, Intrinsic::icall_branch_funnel);
if ((!TypeTestFunc || TypeTestFunc->use_empty()) &&
(!ICallBranchFunnelFunc || ICallBranchFunnelFunc->use_empty()) &&
!ExportSummary && !ImportSummary)
diff --git a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
index 9bf29c4..cd0e412 100644
--- a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
+++ b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -123,7 +123,7 @@ void promoteTypeIds(Module &M, StringRef ModuleId) {
};
if (Function *TypeTestFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::type_test))) {
+ Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_test)) {
for (const Use &U : TypeTestFunc->uses()) {
auto CI = cast<CallInst>(U.getUser());
ExternalizeTypeId(CI, 1);
@@ -131,7 +131,7 @@ void promoteTypeIds(Module &M, StringRef ModuleId) {
}
if (Function *PublicTypeTestFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::public_type_test))) {
+ Intrinsic::getDeclarationIfExists(&M, Intrinsic::public_type_test)) {
for (const Use &U : PublicTypeTestFunc->uses()) {
auto CI = cast<CallInst>(U.getUser());
ExternalizeTypeId(CI, 1);
@@ -139,15 +139,15 @@ void promoteTypeIds(Module &M, StringRef ModuleId) {
}
if (Function *TypeCheckedLoadFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load))) {
+ Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_checked_load)) {
for (const Use &U : TypeCheckedLoadFunc->uses()) {
auto CI = cast<CallInst>(U.getUser());
ExternalizeTypeId(CI, 2);
}
}
- if (Function *TypeCheckedLoadRelativeFunc = M.getFunction(
- Intrinsic::getName(Intrinsic::type_checked_load_relative))) {
+ if (Function *TypeCheckedLoadRelativeFunc = Intrinsic::getDeclarationIfExists(
+ &M, Intrinsic::type_checked_load_relative)) {
for (const Use &U : TypeCheckedLoadRelativeFunc->uses()) {
auto CI = cast<CallInst>(U.getUser());
ExternalizeTypeId(CI, 2);
diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
index 59f986b..45d3221 100644
--- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -851,7 +851,7 @@ void llvm::updateVCallVisibilityInModule(
void llvm::updatePublicTypeTestCalls(Module &M,
bool WholeProgramVisibilityEnabledInLTO) {
Function *PublicTypeTestFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::public_type_test));
+ Intrinsic::getDeclarationIfExists(&M, Intrinsic::public_type_test);
if (!PublicTypeTestFunc)
return;
if (hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO)) {
@@ -2247,12 +2247,13 @@ bool DevirtModule::run() {
return false;
Function *TypeTestFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::type_test));
+ Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_test);
Function *TypeCheckedLoadFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load));
- Function *TypeCheckedLoadRelativeFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load_relative));
- Function *AssumeFunc = M.getFunction(Intrinsic::getName(Intrinsic::assume));
+ Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_checked_load);
+ Function *TypeCheckedLoadRelativeFunc = Intrinsic::getDeclarationIfExists(
+ &M, Intrinsic::type_checked_load_relative);
+ Function *AssumeFunc =
+ Intrinsic::getDeclarationIfExists(&M, Intrinsic::assume);
// Normally if there are no users of the devirtualization intrinsics in the
// module, this pass has nothing to do. But if we are exporting, we also need
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 64bee4a..c8407e8 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -3369,8 +3369,14 @@ Value *InstCombinerImpl::foldAndOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
// We can convert this case to bitwise and, because both operands are used
// on the LHS, and as such poison from both will propagate.
if (Value *V = foldAndOrOfICmpsWithConstEq(RHS, LHS, IsAnd,
- /*IsLogical*/ false, Builder, Q))
+ /*IsLogical=*/false, Builder, Q)) {
+      // If RHS is still used, we should drop its samesign flag.
+ if (IsLogical && RHS->hasSameSign() && !RHS->use_empty()) {
+ RHS->setSameSign(false);
+ addToWorklist(RHS);
+ }
return V;
+ }
if (Value *V = foldIsPowerOf2OrZero(LHS, RHS, IsAnd, Builder, *this))
return V;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 7129499..18a6fdc 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -1738,7 +1738,7 @@ Instruction *InstCombinerImpl::foldICmpAndShift(ICmpInst &Cmp,
// Compute X & (C2 << Y).
Value *NewAnd = Builder.CreateAnd(Shift->getOperand(0), NewShift);
- return replaceOperand(Cmp, 0, NewAnd);
+ return new ICmpInst(Cmp.getPredicate(), NewAnd, Cmp.getOperand(1));
}
return nullptr;
@@ -1844,7 +1844,7 @@ Instruction *InstCombinerImpl::foldICmpAndConstConst(ICmpInst &Cmp,
/*HasNUW=*/true),
One, Or->getName());
Value *NewAnd = Builder.CreateAnd(A, NewOr, And->getName());
- return replaceOperand(Cmp, 0, NewAnd);
+ return new ICmpInst(Cmp.getPredicate(), NewAnd, Cmp.getOperand(1));
}
}
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 8be2eee..ed44f05 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -1448,6 +1448,7 @@ Instruction *InstCombinerImpl::foldSelectEqualityTest(SelectInst &Sel) {
m_c_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(X), m_Specific(Y))))
return nullptr;
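+  // After this fold XeqY is used unconditionally, so a samesign flag that was
+  // only guaranteed under the select's condition must be dropped.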
+ cast<ICmpInst>(XeqY)->setSameSign(false);
return replaceInstUsesWith(Sel, XeqY);
}
@@ -1953,56 +1954,6 @@ Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI,
return &SI;
}
- // FIXME: This code is nearly duplicated in InstSimplify. Using/refactoring
- // decomposeBitTestICmp() might help.
- if (TrueVal->getType()->isIntOrIntVectorTy()) {
- unsigned BitWidth =
- DL.getTypeSizeInBits(TrueVal->getType()->getScalarType());
- APInt MinSignedValue = APInt::getSignedMinValue(BitWidth);
- Value *X;
- const APInt *Y, *C;
- bool TrueWhenUnset;
- bool IsBitTest = false;
- if (ICmpInst::isEquality(Pred) &&
- match(CmpLHS, m_And(m_Value(X), m_Power2(Y))) &&
- match(CmpRHS, m_Zero())) {
- IsBitTest = true;
- TrueWhenUnset = Pred == ICmpInst::ICMP_EQ;
- } else if (Pred == ICmpInst::ICMP_SLT && match(CmpRHS, m_Zero())) {
- X = CmpLHS;
- Y = &MinSignedValue;
- IsBitTest = true;
- TrueWhenUnset = false;
- } else if (Pred == ICmpInst::ICMP_SGT && match(CmpRHS, m_AllOnes())) {
- X = CmpLHS;
- Y = &MinSignedValue;
- IsBitTest = true;
- TrueWhenUnset = true;
- }
- if (IsBitTest) {
- Value *V = nullptr;
- // (X & Y) == 0 ? X : X ^ Y --> X & ~Y
- if (TrueWhenUnset && TrueVal == X &&
- match(FalseVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
- V = Builder.CreateAnd(X, ~(*Y));
- // (X & Y) != 0 ? X ^ Y : X --> X & ~Y
- else if (!TrueWhenUnset && FalseVal == X &&
- match(TrueVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
- V = Builder.CreateAnd(X, ~(*Y));
- // (X & Y) == 0 ? X ^ Y : X --> X | Y
- else if (TrueWhenUnset && FalseVal == X &&
- match(TrueVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
- V = Builder.CreateOr(X, *Y);
- // (X & Y) != 0 ? X : X ^ Y --> X | Y
- else if (!TrueWhenUnset && TrueVal == X &&
- match(FalseVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
- V = Builder.CreateOr(X, *Y);
-
- if (V)
- return replaceInstUsesWith(SI, V);
- }
- }
-
if (Instruction *V =
foldSelectICmpAndAnd(SI.getType(), ICI, TrueVal, FalseVal, Builder))
return V;
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 02d9fab..55e9903 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -1865,11 +1865,9 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
if (UseCalls && ClOptimizeCallbacks) {
const ASanAccessInfo AccessInfo(IsWrite, CompileKernel, AccessSizeIndex);
- Module *M = IRB.GetInsertBlock()->getParent()->getParent();
- IRB.CreateCall(
- Intrinsic::getOrInsertDeclaration(M, Intrinsic::asan_check_memaccess),
- {IRB.CreatePointerCast(Addr, PtrTy),
- ConstantInt::get(Int32Ty, AccessInfo.Packed)});
+ IRB.CreateIntrinsic(Intrinsic::asan_check_memaccess, {},
+ {IRB.CreatePointerCast(Addr, PtrTy),
+ ConstantInt::get(Int32Ty, AccessInfo.Packed)});
return;
}
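
This is the first of several hunks (HWASan, KCFI, PGO, and TSan below repeat it) that fold a manual declaration lookup plus CreateCall into a single IRBuilder::CreateIntrinsic call. The two forms side by side, as a sketch using the non-overloaded debugtrap intrinsic from the KCFI hunk; the wrapper function is assumed for illustration:

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Intrinsics.h"
    using namespace llvm;

    // Hypothetical helper, assuming IRB is positioned at the insertion point.
    static void emitDebugTrap(IRBuilder<> &IRB) {
      // Before this patch: fetch or create the declaration by hand.
      //   Module *M = IRB.GetInsertBlock()->getModule();
      //   IRB.CreateCall(
      //       Intrinsic::getOrInsertDeclaration(M, Intrinsic::debugtrap));
      // After: one call; the builder materializes the declaration itself.
      // The empty lists are the overload types (none) and the arguments.
      IRB.CreateIntrinsic(Intrinsic::debugtrap, /*Types=*/{}, /*Args=*/{});
    }
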
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 5ec4973..21d4d37d 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -1025,7 +1025,6 @@ void HWAddressSanitizer::instrumentMemAccessOutline(Value *Ptr, bool IsWrite,
insertShadowTagCheck(Ptr, InsertBefore, DTU, LI).TagMismatchTerm;
IRBuilder<> IRB(InsertBefore);
- Module *M = IRB.GetInsertBlock()->getParent()->getParent();
bool UseFixedShadowIntrinsic = false;
// The memaccess fixed shadow intrinsic is only supported on AArch64,
// which allows a 16-bit immediate to be left-shifted by 32.
@@ -1041,19 +1040,18 @@ void HWAddressSanitizer::instrumentMemAccessOutline(Value *Ptr, bool IsWrite,
}
if (UseFixedShadowIntrinsic) {
- IRB.CreateCall(
- Intrinsic::getOrInsertDeclaration(
- M, UseShortGranules
- ? Intrinsic::hwasan_check_memaccess_shortgranules_fixedshadow
- : Intrinsic::hwasan_check_memaccess_fixedshadow),
+ IRB.CreateIntrinsic(
+ UseShortGranules
+ ? Intrinsic::hwasan_check_memaccess_shortgranules_fixedshadow
+ : Intrinsic::hwasan_check_memaccess_fixedshadow,
+ {},
{Ptr, ConstantInt::get(Int32Ty, AccessInfo),
ConstantInt::get(Int64Ty, Mapping.offset())});
} else {
- IRB.CreateCall(Intrinsic::getOrInsertDeclaration(
- M, UseShortGranules
- ? Intrinsic::hwasan_check_memaccess_shortgranules
- : Intrinsic::hwasan_check_memaccess),
- {ShadowBase, Ptr, ConstantInt::get(Int32Ty, AccessInfo)});
+ IRB.CreateIntrinsic(
+ UseShortGranules ? Intrinsic::hwasan_check_memaccess_shortgranules
+ : Intrinsic::hwasan_check_memaccess,
+ {}, {ShadowBase, Ptr, ConstantInt::get(Int32Ty, AccessInfo)});
}
}
diff --git a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
index 8663710..43b8d5e 100644
--- a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
+++ b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -944,7 +944,7 @@ computeVirtualCallSiteTypeInfoMap(Module &M, ModuleAnalysisManager &MAM,
  // Find virtual calls by looking at users of llvm.type.checked.load in
  // that case.
Function *TypeTestFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::type_test));
+ Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_test);
if (!TypeTestFunc || TypeTestFunc->use_empty())
return;
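
The lookup rewrite here recurs through the rest of the patch (InstrProfiling and the guard-related passes below): M.getFunction(Intrinsic::getName(ID)) builds a name string and only works for intrinsics without overloaded types, while Intrinsic::getDeclarationIfExists queries the module directly and returns nullptr when the intrinsic was never declared. A sketch of the resulting idiom; the helper name is hypothetical:

    #include "llvm/IR/Intrinsics.h"
    #include "llvm/IR/Module.h"
    using namespace llvm;

    static bool moduleUsesGuards(Module &M) {
      // nullptr when llvm.experimental.guard was never declared in M.
      Function *GuardDecl =
          Intrinsic::getDeclarationIfExists(&M, Intrinsic::experimental_guard);
      return GuardDecl && !GuardDecl->use_empty();
    }
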
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index 929c787..d7d809d 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -902,15 +902,15 @@ static bool needsRuntimeHookUnconditionally(const Triple &TT) {
/// Check if the module contains uses of any profiling intrinsics.
static bool containsProfilingIntrinsics(Module &M) {
auto containsIntrinsic = [&](int ID) {
- if (auto *F = M.getFunction(Intrinsic::getName(ID)))
+ if (auto *F = Intrinsic::getDeclarationIfExists(&M, ID))
return !F->use_empty();
return false;
};
- return containsIntrinsic(llvm::Intrinsic::instrprof_cover) ||
- containsIntrinsic(llvm::Intrinsic::instrprof_increment) ||
- containsIntrinsic(llvm::Intrinsic::instrprof_increment_step) ||
- containsIntrinsic(llvm::Intrinsic::instrprof_timestamp) ||
- containsIntrinsic(llvm::Intrinsic::instrprof_value_profile);
+ return containsIntrinsic(Intrinsic::instrprof_cover) ||
+ containsIntrinsic(Intrinsic::instrprof_increment) ||
+ containsIntrinsic(Intrinsic::instrprof_increment_step) ||
+ containsIntrinsic(Intrinsic::instrprof_timestamp) ||
+ containsIntrinsic(Intrinsic::instrprof_value_profile);
}
bool InstrLowerer::lower() {
diff --git a/llvm/lib/Transforms/Instrumentation/KCFI.cpp b/llvm/lib/Transforms/Instrumentation/KCFI.cpp
index bbe0f4c..4b653a8 100644
--- a/llvm/lib/Transforms/Instrumentation/KCFI.cpp
+++ b/llvm/lib/Transforms/Instrumentation/KCFI.cpp
@@ -110,8 +110,7 @@ PreservedAnalyses KCFIPass::run(Function &F, FunctionAnalysisManager &AM) {
Instruction *ThenTerm =
SplitBlockAndInsertIfThen(Test, Call, false, VeryUnlikelyWeights);
Builder.SetInsertPoint(ThenTerm);
- Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(&M, Intrinsic::debugtrap));
+ Builder.CreateIntrinsic(Intrinsic::debugtrap, {}, {});
++NumKCFIChecks;
}
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index dbe908b..919660e 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -918,8 +918,8 @@ void FunctionInstrumenter::instrument() {
IRBuilder<> Builder(&EntryBB, EntryBB.getFirstInsertionPt());
// llvm.instrprof.cover(i8* <name>, i64 <hash>, i32 <num-counters>,
// i32 <index>)
- Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(&M, Intrinsic::instrprof_cover),
+ Builder.CreateIntrinsic(
+ Intrinsic::instrprof_cover, {},
{NormalizedNamePtr, CFGHash, Builder.getInt32(1), Builder.getInt32(0)});
return;
}
@@ -971,10 +971,10 @@ void FunctionInstrumenter::instrument() {
IRBuilder<> Builder(&EntryBB, EntryBB.getFirstInsertionPt());
// llvm.instrprof.timestamp(i8* <name>, i64 <hash>, i32 <num-counters>,
// i32 <index>)
- Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(&M, Intrinsic::instrprof_timestamp),
- {NormalizedNamePtr, CFGHash, Builder.getInt32(NumCounters),
- Builder.getInt32(I)});
+ Builder.CreateIntrinsic(Intrinsic::instrprof_timestamp, {},
+ {NormalizedNamePtr, CFGHash,
+ Builder.getInt32(NumCounters),
+ Builder.getInt32(I)});
I += PGOBlockCoverage ? 8 : 1;
}
@@ -984,12 +984,12 @@ void FunctionInstrumenter::instrument() {
"Cannot get the Instrumentation point");
// llvm.instrprof.increment(i8* <name>, i64 <hash>, i32 <num-counters>,
// i32 <index>)
- Builder.CreateCall(Intrinsic::getOrInsertDeclaration(
- &M, PGOBlockCoverage
- ? Intrinsic::instrprof_cover
- : Intrinsic::instrprof_increment),
- {NormalizedNamePtr, CFGHash,
- Builder.getInt32(NumCounters), Builder.getInt32(I++)});
+ Builder.CreateIntrinsic(PGOBlockCoverage ? Intrinsic::instrprof_cover
+ : Intrinsic::instrprof_increment,
+ {},
+ {NormalizedNamePtr, CFGHash,
+ Builder.getInt32(NumCounters),
+ Builder.getInt32(I++)});
}
// Now instrument select instructions:
@@ -1726,10 +1726,10 @@ void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) {
auto *NormalizedFuncNameVarPtr =
ConstantExpr::getPointerBitCastOrAddrSpaceCast(
FuncNameVar, PointerType::get(M->getContext(), 0));
- Builder.CreateCall(
- Intrinsic::getOrInsertDeclaration(M, Intrinsic::instrprof_increment_step),
- {NormalizedFuncNameVarPtr, Builder.getInt64(FuncHash),
- Builder.getInt32(TotalNumCtrs), Builder.getInt32(*CurCtrIdx), Step});
+ Builder.CreateIntrinsic(Intrinsic::instrprof_increment_step, {},
+ {NormalizedFuncNameVarPtr, Builder.getInt64(FuncHash),
+ Builder.getInt32(TotalNumCtrs),
+ Builder.getInt32(*CurCtrIdx), Step});
++(*CurCtrIdx);
}
@@ -1916,7 +1916,6 @@ static bool InstrumentAllFunctions(
std::unordered_multimap<Comdat *, GlobalValue *> ComdatMembers;
collectComdatMembers(M, ComdatMembers);
- bool AnythingInstrumented = false;
for (auto &F : M) {
if (skipPGOGen(F))
continue;
@@ -1926,9 +1925,8 @@ static bool InstrumentAllFunctions(
FunctionInstrumenter FI(M, F, TLI, ComdatMembers, BPI, BFI,
InstrumentationType);
FI.instrument();
- AnythingInstrumented = true;
}
- return AnythingInstrumented;
+ return true;
}
PreservedAnalyses
diff --git a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index 388addf..915dc70 100644
--- a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -572,9 +572,7 @@ bool ThreadSanitizer::sanitizeFunction(Function &F,
if ((Res || HasCalls) && ClInstrumentFuncEntryExit) {
InstrumentationIRBuilder IRB(F.getEntryBlock().getFirstNonPHI());
Value *ReturnAddress =
- IRB.CreateCall(Intrinsic::getOrInsertDeclaration(
- F.getParent(), Intrinsic::returnaddress),
- IRB.getInt32(0));
+ IRB.CreateIntrinsic(Intrinsic::returnaddress, {}, IRB.getInt32(0));
IRB.CreateCall(TsanFuncEntry, ReturnAddress);
EscapeEnumerator EE(F, "tsan_cleanup", ClHandleCxxExceptions);
diff --git a/llvm/lib/Transforms/Scalar/Float2Int.cpp b/llvm/lib/Transforms/Scalar/Float2Int.cpp
index 98ecbe4..9d23c89 100644
--- a/llvm/lib/Transforms/Scalar/Float2Int.cpp
+++ b/llvm/lib/Transforms/Scalar/Float2Int.cpp
@@ -398,9 +398,9 @@ bool Float2IntPass::validateAndTransform(const DataLayout &DL) {
}
Value *Float2IntPass::convert(Instruction *I, Type *ToTy) {
- if (ConvertedInsts.contains(I))
+ if (auto It = ConvertedInsts.find(I); It != ConvertedInsts.end())
// Already converted this instruction.
- return ConvertedInsts[I];
+ return It->second;
SmallVector<Value*,4> NewOperands;
for (Value *V : I->operands()) {
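
The Float2Int change above is the standard single-lookup map idiom: contains() followed by operator[] hashes the key twice, while one find() does the whole job. In isolation, with hypothetical names:

    #include "llvm/ADT/DenseMap.h"
    using namespace llvm;

    static Value *lookupConverted(DenseMap<Instruction *, Value *> &Map,
                                  Instruction *I) {
      if (auto It = Map.find(I); It != Map.end())
        return It->second; // hit: one hash lookup instead of two
      return nullptr;      // miss
    }
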
diff --git a/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/llvm/lib/Transforms/Scalar/GuardWidening.cpp
index e7ff2a1..7fa9f42 100644
--- a/llvm/lib/Transforms/Scalar/GuardWidening.cpp
+++ b/llvm/lib/Transforms/Scalar/GuardWidening.cpp
@@ -980,11 +980,11 @@ StringRef GuardWideningImpl::scoreTypeToString(WideningScore WS) {
PreservedAnalyses GuardWideningPass::run(Function &F,
FunctionAnalysisManager &AM) {
// Avoid requesting analyses if there are no guards or widenable conditions.
- auto *GuardDecl = F.getParent()->getFunction(
- Intrinsic::getName(Intrinsic::experimental_guard));
+ auto *GuardDecl = Intrinsic::getDeclarationIfExists(
+ F.getParent(), Intrinsic::experimental_guard);
bool HasIntrinsicGuards = GuardDecl && !GuardDecl->use_empty();
- auto *WCDecl = F.getParent()->getFunction(
- Intrinsic::getName(Intrinsic::experimental_widenable_condition));
+ auto *WCDecl = Intrinsic::getDeclarationIfExists(
+ F.getParent(), Intrinsic::experimental_widenable_condition);
bool HasWidenableConditions = WCDecl && !WCDecl->use_empty();
if (!HasIntrinsicGuards && !HasWidenableConditions)
return PreservedAnalyses::all();
diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
index 2668305..ad68fc1 100644
--- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -598,8 +598,8 @@ bool IndVarSimplify::simplifyAndExtend(Loop *L,
LoopInfo *LI) {
SmallVector<WideIVInfo, 8> WideIVs;
- auto *GuardDecl = L->getBlocks()[0]->getModule()->getFunction(
- Intrinsic::getName(Intrinsic::experimental_guard));
+ auto *GuardDecl = Intrinsic::getDeclarationIfExists(
+ L->getBlocks()[0]->getModule(), Intrinsic::experimental_guard);
bool HasGuards = GuardDecl && !GuardDecl->use_empty();
SmallVector<PHINode *, 8> LoopPhis;
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index 7a0b661..11fdc394 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -296,8 +296,8 @@ bool JumpThreadingPass::runImpl(Function &F_, FunctionAnalysisManager *FAM_,
DTU = std::move(DTU_);
BFI = BFI_;
BPI = BPI_;
- auto *GuardDecl = F->getParent()->getFunction(
- Intrinsic::getName(Intrinsic::experimental_guard));
+ auto *GuardDecl = Intrinsic::getDeclarationIfExists(
+ F->getParent(), Intrinsic::experimental_guard);
HasGuards = GuardDecl && !GuardDecl->use_empty();
// Reduce the number of instructions duplicated when optimizing strictly for
diff --git a/llvm/lib/Transforms/Scalar/LoopPredication.cpp b/llvm/lib/Transforms/Scalar/LoopPredication.cpp
index 209b083..31694ad 100644
--- a/llvm/lib/Transforms/Scalar/LoopPredication.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopPredication.cpp
@@ -1193,10 +1193,10 @@ bool LoopPredication::runOnLoop(Loop *Loop) {
// There is nothing to do if the module doesn't use guards
auto *GuardDecl =
- M->getFunction(Intrinsic::getName(Intrinsic::experimental_guard));
+ Intrinsic::getDeclarationIfExists(M, Intrinsic::experimental_guard);
bool HasIntrinsicGuards = GuardDecl && !GuardDecl->use_empty();
- auto *WCDecl = M->getFunction(
- Intrinsic::getName(Intrinsic::experimental_widenable_condition));
+ auto *WCDecl = Intrinsic::getDeclarationIfExists(
+ M, Intrinsic::experimental_widenable_condition);
bool HasWidenableConditions =
PredicateWidenableBranchGuards && WCDecl && !WCDecl->use_empty();
if (!HasIntrinsicGuards && !HasWidenableConditions)
diff --git a/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp b/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp
index ce35349..5f3e612 100644
--- a/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp
@@ -27,8 +27,8 @@ using namespace llvm;
static bool lowerGuardIntrinsic(Function &F) {
// Check if we can cheaply rule out the possibility of not having any work to
// do.
- auto *GuardDecl = F.getParent()->getFunction(
- Intrinsic::getName(Intrinsic::experimental_guard));
+ auto *GuardDecl = Intrinsic::getDeclarationIfExists(
+ F.getParent(), Intrinsic::experimental_guard);
if (!GuardDecl || GuardDecl->use_empty())
return false;
diff --git a/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp b/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp
index 3c977b8..ea2b419 100644
--- a/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp
@@ -26,8 +26,8 @@ using namespace llvm;
static bool lowerWidenableCondition(Function &F) {
// Check if we can cheaply rule out the possibility of not having any work to
// do.
- auto *WCDecl = F.getParent()->getFunction(
- Intrinsic::getName(Intrinsic::experimental_widenable_condition));
+ auto *WCDecl = Intrinsic::getDeclarationIfExists(
+ F.getParent(), Intrinsic::experimental_widenable_condition);
if (!WCDecl || WCDecl->use_empty())
return false;
diff --git a/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp b/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp
index b9f88ba..948466c 100644
--- a/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp
+++ b/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp
@@ -56,8 +56,8 @@ static void turnToExplicitForm(CallInst *Guard, Function *DeoptIntrinsic) {
static bool explicifyGuards(Function &F) {
// Check if we can cheaply rule out the possibility of not having any work to
// do.
- auto *GuardDecl = F.getParent()->getFunction(
- Intrinsic::getName(Intrinsic::experimental_guard));
+ auto *GuardDecl = Intrinsic::getDeclarationIfExists(
+ F.getParent(), Intrinsic::experimental_guard);
if (!GuardDecl || GuardDecl->use_empty())
return false;
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index f3f5ffb..aa3cbc5 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -2920,8 +2920,8 @@ static bool collectUnswitchCandidates(
// Whether or not we should also collect guards in the loop.
bool CollectGuards = false;
if (UnswitchGuards) {
- auto *GuardDecl = L.getHeader()->getParent()->getParent()->getFunction(
- Intrinsic::getName(Intrinsic::experimental_guard));
+ auto *GuardDecl = Intrinsic::getDeclarationIfExists(
+ L.getHeader()->getParent()->getParent(), Intrinsic::experimental_guard);
if (GuardDecl && !GuardDecl->use_empty())
CollectGuards = true;
}
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index 101d605..c65710e 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -630,10 +630,7 @@ private:
}
  // Add U as an additional user of V.
- void addAdditionalUser(Value *V, User *U) {
- auto Iter = AdditionalUsers.insert({V, {}});
- Iter.first->second.insert(U);
- }
+ void addAdditionalUser(Value *V, User *U) { AdditionalUsers[V].insert(U); }
// Mark I's users as changed, including AdditionalUsers.
void markUsersAsChanged(Value *I) {
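
The addAdditionalUser simplification relies on map operator[] default-constructing the mapped set on first access, which makes the explicit insert({V, {}}) two-step redundant. Sketched below with an assumed container layout:

    #include "llvm/ADT/DenseMap.h"
    #include "llvm/ADT/SmallPtrSet.h"
    using namespace llvm;

    DenseMap<Value *, SmallPtrSet<User *, 2>> AdditionalUsers; // assumed type
    void addAdditionalUser(Value *V, User *U) {
      // operator[] creates an empty set for a new V, then the insert fills it:
      // one line, and a single lookup on the common path.
      AdditionalUsers[V].insert(U);
    }
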
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SeedCollector.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SeedCollector.cpp
index 20df9e3..66fac08 100644
--- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SeedCollector.cpp
+++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SeedCollector.cpp
@@ -112,21 +112,23 @@ template void SeedContainer::insert<LoadInst>(LoadInst *);
template void SeedContainer::insert<StoreInst>(StoreInst *);
#ifndef NDEBUG
-void SeedContainer::dump() const {
+void SeedContainer::print(raw_ostream &OS) const {
for (const auto &Pair : Bundles) {
auto [I, Ty, Opc] = Pair.first;
const auto &SeedsVec = Pair.second;
std::string RefType = dyn_cast<LoadInst>(I) ? "Load"
: dyn_cast<StoreInst>(I) ? "Store"
: "Other";
- dbgs() << "[Inst=" << *I << " Ty=" << Ty << " " << RefType << "]\n";
+ OS << "[Inst=" << *I << " Ty=" << Ty << " " << RefType << "]\n";
for (const auto &SeedPtr : SeedsVec) {
- SeedPtr->dump(dbgs());
- dbgs() << "\n";
+ SeedPtr->dump(OS);
+ OS << "\n";
}
}
- dbgs() << "\n";
+ OS << "\n";
}
+
+LLVM_DUMP_METHOD void SeedContainer::dump() const { print(dbgs()); }
#endif // NDEBUG
} // namespace llvm::sandboxir
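
The SeedCollector hunk adopts the usual LLVM debug-printing split: a print(raw_ostream &) that can target any stream (and is therefore testable via raw_string_ostream), plus an LLVM_DUMP_METHOD dump() that forwards to dbgs() for debugger use. The shape of the pattern on a hypothetical type:

    #include "llvm/Support/Compiler.h"
    #include "llvm/Support/Debug.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    struct Widget { // hypothetical example type
      int ID = 0;
      void print(raw_ostream &OS) const { OS << "Widget " << ID << "\n"; }
    #ifndef NDEBUG
      // Compiled out of release builds; callable from a debugger prompt.
      LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
    #endif
    };
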
diff --git a/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll b/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll
index 20d4700..b6b4998 100644
--- a/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll
@@ -526,11 +526,11 @@ define void @frem() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4BF16 = frem <4 x bfloat> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8BF16 = frem <8 x bfloat> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16BF16 = frem <16 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1BF16 = frem <vscale x 1 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2BF16 = frem <vscale x 2 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4BF16 = frem <vscale x 4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV8BF16 = frem <vscale x 8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV16BF16 = frem <vscale x 16 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV1BF16 = frem <vscale x 1 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV2BF16 = frem <vscale x 2 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV4BF16 = frem <vscale x 4 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV8BF16 = frem <vscale x 8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV16BF16 = frem <vscale x 16 x bfloat> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = frem <1 x float> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = frem <2 x float> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F32 = frem <4 x float> undef, undef
@@ -593,37 +593,21 @@ define void @frem() {
}
define void @frem_f16() {
-; ZVFH-LABEL: 'frem_f16'
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = frem half undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F16 = frem <1 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F16 = frem <2 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F16 = frem <4 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F16 = frem <8 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16F16 = frem <16 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 127 for instruction: %V32F16 = frem <32 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV1F16 = frem <vscale x 1 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV2F16 = frem <vscale x 2 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV4F16 = frem <vscale x 4 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV8F16 = frem <vscale x 8 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV16F16 = frem <vscale x 16 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV32F16 = frem <vscale x 32 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; ZVFHMIN-LABEL: 'frem_f16'
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = frem half undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F16 = frem <1 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F16 = frem <2 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F16 = frem <4 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F16 = frem <8 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16F16 = frem <16 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 127 for instruction: %V32F16 = frem <32 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16 = frem <vscale x 1 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16 = frem <vscale x 2 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16 = frem <vscale x 4 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV8F16 = frem <vscale x 8 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV16F16 = frem <vscale x 16 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV32F16 = frem <vscale x 32 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-LABEL: 'frem_f16'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = frem half undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F16 = frem <1 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F16 = frem <2 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F16 = frem <4 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F16 = frem <8 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16F16 = frem <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 127 for instruction: %V32F16 = frem <32 x half> undef, undef
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV1F16 = frem <vscale x 1 x half> undef, undef
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV2F16 = frem <vscale x 2 x half> undef, undef
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV4F16 = frem <vscale x 4 x half> undef, undef
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV8F16 = frem <vscale x 8 x half> undef, undef
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV16F16 = frem <vscale x 16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV32F16 = frem <vscale x 32 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%F16 = frem half undef, undef
diff --git a/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll b/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll
index 78acba8..efe17f2 100644
--- a/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll
@@ -1,17 +1,18 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh | FileCheck %s
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zvfh,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,ZVFH
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zvfhmin,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
define void @sqrt() {
; CHECK-LABEL: 'sqrt'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.sqrt.f16(half undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 2 x half> @llvm.sqrt.nxv2f16(<vscale x 2 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 4 x half> @llvm.sqrt.nxv4f16(<vscale x 4 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 8 x half> @llvm.sqrt.nxv8f16(<vscale x 8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 16 x half> @llvm.sqrt.nxv16f16(<vscale x 16 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call bfloat @llvm.sqrt.bf16(bfloat undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 2 x bfloat> @llvm.sqrt.nxv2bf16(<vscale x 2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 4 x bfloat> @llvm.sqrt.nxv4bf16(<vscale x 4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 8 x bfloat> @llvm.sqrt.nxv8bf16(<vscale x 8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 16 x bfloat> @llvm.sqrt.nxv16bf16(<vscale x 16 x bfloat> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call float @llvm.sqrt.f32(float undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = call <2 x float> @llvm.sqrt.v2f32(<2 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef)
@@ -33,15 +34,15 @@ define void @sqrt() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %28 = call <vscale x 8 x double> @llvm.sqrt.nxv8f64(<vscale x 8 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
- call half @llvm.sqrt.f16(half undef)
- call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
- call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
- call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
- call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
- call <vscale x 2 x half> @llvm.sqrt.nvx2f16(<vscale x 2 x half> undef)
- call <vscale x 4 x half> @llvm.sqrt.nvx4f16(<vscale x 4 x half> undef)
- call <vscale x 8 x half> @llvm.sqrt.nvx8f16(<vscale x 8 x half> undef)
- call <vscale x 16 x half> @llvm.sqrt.nvx16f16(<vscale x 16 x half> undef)
+ call bfloat @llvm.sqrt.bf16(bfloat undef)
+ call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef)
+ call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef)
+ call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef)
+ call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef)
+ call <vscale x 2 x bfloat> @llvm.sqrt.nvx2bf16(<vscale x 2 x bfloat> undef)
+ call <vscale x 4 x bfloat> @llvm.sqrt.nvx4bf16(<vscale x 4 x bfloat> undef)
+ call <vscale x 8 x bfloat> @llvm.sqrt.nvx8bf16(<vscale x 8 x bfloat> undef)
+ call <vscale x 16 x bfloat> @llvm.sqrt.nvx16bf16(<vscale x 16 x bfloat> undef)
call float @llvm.sqrt.f32(float undef)
call <2 x float> @llvm.sqrt.v2f32(<2 x float> undef)
call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef)
@@ -64,58 +65,74 @@ define void @sqrt() {
ret void
}
-declare half @llvm.sqrt.f16(half)
-declare <2 x half> @llvm.sqrt.v2f16(<2 x half>)
-declare <4 x half> @llvm.sqrt.v4f16(<4 x half>)
-declare <8 x half> @llvm.sqrt.v8f16(<8 x half>)
-declare <16 x half> @llvm.sqrt.v16f16(<16 x half>)
-declare <vscale x 2 x half> @llvm.sqrt.nvx2f16(<vscale x 2 x half>)
-declare <vscale x 4 x half> @llvm.sqrt.nvx4f16(<vscale x 4 x half>)
-declare <vscale x 8 x half> @llvm.sqrt.nvx8f16(<vscale x 8 x half>)
-declare <vscale x 16 x half> @llvm.sqrt.nvx16f16(<vscale x 16 x half>)
-declare float @llvm.sqrt.f32(float)
-declare <2 x float> @llvm.sqrt.v2f32(<2 x float>)
-declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
-declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
-declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
-declare <vscale x 1 x float> @llvm.sqrt.nvx1f32(<vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.sqrt.nvx2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.sqrt.nvx4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.sqrt.nvx8f32(<vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.sqrt.nvx16f32(<vscale x 16 x float>)
-declare double @llvm.sqrt.f64(double)
-declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)
-declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)
-declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
-declare <16 x double> @llvm.sqrt.v16f64(<16 x double>)
-declare <vscale x 1 x double> @llvm.sqrt.nvx1f64(<vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.sqrt.nvx2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.sqrt.nvx4f64(<vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.sqrt.nvx8f64(<vscale x 8 x double>)
+define void @sqrt_f16() {
+; CHECK-LABEL: 'sqrt_f16'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.sqrt.f16(half undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 2 x half> @llvm.sqrt.nxv2f16(<vscale x 2 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 4 x half> @llvm.sqrt.nxv4f16(<vscale x 4 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 8 x half> @llvm.sqrt.nxv8f16(<vscale x 8 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 16 x half> @llvm.sqrt.nxv16f16(<vscale x 16 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ call half @llvm.sqrt.f16(half undef)
+ call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
+ call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
+ call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
+ call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
+ call <vscale x 2 x half> @llvm.sqrt.nvx2f16(<vscale x 2 x half> undef)
+ call <vscale x 4 x half> @llvm.sqrt.nvx4f16(<vscale x 4 x half> undef)
+ call <vscale x 8 x half> @llvm.sqrt.nvx8f16(<vscale x 8 x half> undef)
+ call <vscale x 16 x half> @llvm.sqrt.nvx16f16(<vscale x 16 x half> undef)
+ ret void
+}
define void @pow() {
; CHECK-LABEL: 'pow'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call float @llvm.pow.f32(float undef, float undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x float> @llvm.pow.v2f32(<2 x float> undef, <2 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x float> @llvm.pow.v4f32(<4 x float> undef, <4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %4 = call <8 x float> @llvm.pow.v8f32(<8 x float> undef, <8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x float> @llvm.pow.v16f32(<16 x float> undef, <16 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x float> @llvm.pow.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x float> @llvm.pow.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x float> @llvm.pow.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x float> @llvm.pow.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call double @llvm.pow.f64(double undef, double undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x double> @llvm.pow.v2f64(<2 x double> undef, <2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %13 = call <4 x double> @llvm.pow.v4f64(<4 x double> undef, <4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x double> @llvm.pow.v8f64(<8 x double> undef, <8 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x double> @llvm.pow.v16f64(<16 x double> undef, <16 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x double> @llvm.pow.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x double> @llvm.pow.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x double> @llvm.pow.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x double> @llvm.pow.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call bfloat @llvm.pow.bf16(bfloat undef, bfloat undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x bfloat> @llvm.pow.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.pow.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.pow.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.pow.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x bfloat> @llvm.pow.nxv1bf16(<vscale x 1 x bfloat> undef, <vscale x 1 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x bfloat> @llvm.pow.nxv2bf16(<vscale x 2 x bfloat> undef, <vscale x 2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x bfloat> @llvm.pow.nxv4bf16(<vscale x 4 x bfloat> undef, <vscale x 4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x bfloat> @llvm.pow.nxv8bf16(<vscale x 8 x bfloat> undef, <vscale x 8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x bfloat> @llvm.pow.nxv16bf16(<vscale x 16 x bfloat> undef, <vscale x 16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.pow.f32(float undef, float undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.pow.v2f32(<2 x float> undef, <2 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.pow.v4f32(<4 x float> undef, <4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x float> @llvm.pow.v8f32(<8 x float> undef, <8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x float> @llvm.pow.v16f32(<16 x float> undef, <16 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x float> @llvm.pow.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x float> @llvm.pow.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x float> @llvm.pow.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %20 = call <vscale x 16 x float> @llvm.pow.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %21 = call double @llvm.pow.f64(double undef, double undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %22 = call <2 x double> @llvm.pow.v2f64(<2 x double> undef, <2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %23 = call <4 x double> @llvm.pow.v4f64(<4 x double> undef, <4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %24 = call <8 x double> @llvm.pow.v8f64(<8 x double> undef, <8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %25 = call <16 x double> @llvm.pow.v16f64(<16 x double> undef, <16 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %26 = call <vscale x 1 x double> @llvm.pow.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %27 = call <vscale x 2 x double> @llvm.pow.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %28 = call <vscale x 4 x double> @llvm.pow.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %29 = call <vscale x 8 x double> @llvm.pow.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
+ call bfloat @llvm.pow.bf16(bfloat undef, bfloat undef)
+ call <2 x bfloat> @llvm.pow.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+ call <4 x bfloat> @llvm.pow.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+ call <8 x bfloat> @llvm.pow.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+ call <16 x bfloat> @llvm.pow.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+ call <vscale x 1 x bfloat> @llvm.pow.nvx1bf16(<vscale x 1 x bfloat> undef, <vscale x 1 x bfloat> undef)
+ call <vscale x 2 x bfloat> @llvm.pow.nvx2bf16(<vscale x 2 x bfloat> undef, <vscale x 2 x bfloat> undef)
+ call <vscale x 4 x bfloat> @llvm.pow.nvx4bf16(<vscale x 4 x bfloat> undef, <vscale x 4 x bfloat> undef)
+ call <vscale x 8 x bfloat> @llvm.pow.nvx8bf16(<vscale x 8 x bfloat> undef, <vscale x 8 x bfloat> undef)
+ call <vscale x 16 x bfloat> @llvm.pow.nvx16bf16(<vscale x 16 x bfloat> undef, <vscale x 16 x bfloat> undef)
call float @llvm.pow.f32(float undef, float undef)
call <2 x float> @llvm.pow.v2f32(<2 x float> undef, <2 x float> undef)
call <4 x float> @llvm.pow.v4f32(<4 x float> undef, <4 x float> undef)
@@ -138,22 +155,42 @@ define void @pow() {
ret void
}
-declare float @llvm.pow.f32(float, float)
-declare <2 x float> @llvm.pow.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.pow.v4f32(<4 x float>, <4 x float>)
-declare <8 x float> @llvm.pow.v8f32(<8 x float>, <8 x float>)
-declare <16 x float> @llvm.pow.v16f32(<16 x float>, <16 x float>)
-declare <vscale x 1 x float> @llvm.pow.nvx1f32(<vscale x 1 x float>, <vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.pow.nvx2f32(<vscale x 2 x float>, <vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.pow.nvx4f32(<vscale x 4 x float>, <vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.pow.nvx8f32(<vscale x 8 x float>, <vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.pow.nvx16f32(<vscale x 16 x float>, <vscale x 16 x float>)
-declare double @llvm.pow.f64(double, double)
-declare <2 x double> @llvm.pow.v2f64(<2 x double>, <2 x double>)
-declare <4 x double> @llvm.pow.v4f64(<4 x double>, <4 x double>)
-declare <8 x double> @llvm.pow.v8f64(<8 x double>, <8 x double>)
-declare <16 x double> @llvm.pow.v16f64(<16 x double>, <16 x double>)
-declare <vscale x 1 x double> @llvm.pow.nvx1f64(<vscale x 1 x double>, <vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.pow.nvx2f64(<vscale x 2 x double>, <vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.pow.nvx4f64(<vscale x 4 x double>, <vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.pow.nvx8f64(<vscale x 8 x double>, <vscale x 8 x double>)
+define void @pow_f16() {
+; ZVFH-LABEL: 'pow_f16'
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.pow.f16(half undef, half undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <2 x half> @llvm.pow.v2f16(<2 x half> undef, <2 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %3 = call <4 x half> @llvm.pow.v4f16(<4 x half> undef, <4 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %4 = call <8 x half> @llvm.pow.v8f16(<8 x half> undef, <8 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %5 = call <16 x half> @llvm.pow.v16f16(<16 x half> undef, <16 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.pow.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.pow.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.pow.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.pow.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x half> @llvm.pow.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'pow_f16'
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call half @llvm.pow.f16(half undef, half undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x half> @llvm.pow.v2f16(<2 x half> undef, <2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.pow.v4f16(<4 x half> undef, <4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.pow.v8f16(<8 x half> undef, <8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.pow.v16f16(<16 x half> undef, <16 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.pow.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.pow.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.pow.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.pow.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x half> @llvm.pow.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ call half @llvm.pow.f16(half undef, half undef)
+ call <2 x half> @llvm.pow.v2f16(<2 x half> undef, <2 x half> undef)
+ call <4 x half> @llvm.pow.v4f16(<4 x half> undef, <4 x half> undef)
+ call <8 x half> @llvm.pow.v8f16(<8 x half> undef, <8 x half> undef)
+ call <16 x half> @llvm.pow.v16f16(<16 x half> undef, <16 x half> undef)
+ call <vscale x 1 x half> @llvm.pow.nvx1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef)
+ call <vscale x 2 x half> @llvm.pow.nvx2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef)
+ call <vscale x 4 x half> @llvm.pow.nvx4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef)
+ call <vscale x 8 x half> @llvm.pow.nvx8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef)
+ call <vscale x 16 x half> @llvm.pow.nvx16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef)
+ ret void
+}
diff --git a/llvm/test/Analysis/CostModel/RISCV/fp-trig-log-exp.ll b/llvm/test/Analysis/CostModel/RISCV/fp-trig-log-exp.ll
index af77911..34d6c93 100644
--- a/llvm/test/Analysis/CostModel/RISCV/fp-trig-log-exp.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/fp-trig-log-exp.ll
@@ -1,29 +1,50 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d | FileCheck %s
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zvfh,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,ZVFH
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zvfhmin,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
define void @sin() {
; CHECK-LABEL: 'sin'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call float @llvm.sin.f32(float undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x float> @llvm.sin.v2f32(<2 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x float> @llvm.sin.v4f32(<4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %4 = call <8 x float> @llvm.sin.v8f32(<8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x float> @llvm.sin.v16f32(<16 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x float> @llvm.sin.nxv1f32(<vscale x 1 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x float> @llvm.sin.nxv2f32(<vscale x 2 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x float> @llvm.sin.nxv8f32(<vscale x 8 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x float> @llvm.sin.nxv16f32(<vscale x 16 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call double @llvm.sin.f64(double undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x double> @llvm.sin.v2f64(<2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %13 = call <4 x double> @llvm.sin.v4f64(<4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x double> @llvm.sin.v8f64(<8 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x double> @llvm.sin.v16f64(<16 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x double> @llvm.sin.nxv1f64(<vscale x 1 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x double> @llvm.sin.nxv2f64(<vscale x 2 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x double> @llvm.sin.nxv4f64(<vscale x 4 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x double> @llvm.sin.nxv8f64(<vscale x 8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call bfloat @llvm.sin.bf16(bfloat undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x bfloat> @llvm.sin.v2bf16(<2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.sin.v4bf16(<4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.sin.v8bf16(<8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.sin.v16bf16(<16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x bfloat> @llvm.sin.nxv1bf16(<vscale x 1 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x bfloat> @llvm.sin.nxv2bf16(<vscale x 2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x bfloat> @llvm.sin.nxv4bf16(<vscale x 4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x bfloat> @llvm.sin.nxv8bf16(<vscale x 8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x bfloat> @llvm.sin.nxv16bf16(<vscale x 16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.sin.f32(float undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.sin.v2f32(<2 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.sin.v4f32(<4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x float> @llvm.sin.v8f32(<8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x float> @llvm.sin.v16f32(<16 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x float> @llvm.sin.nxv1f32(<vscale x 1 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x float> @llvm.sin.nxv2f32(<vscale x 2 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x float> @llvm.sin.nxv8f32(<vscale x 8 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %20 = call <vscale x 16 x float> @llvm.sin.nxv16f32(<vscale x 16 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %21 = call double @llvm.sin.f64(double undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %22 = call <2 x double> @llvm.sin.v2f64(<2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %23 = call <4 x double> @llvm.sin.v4f64(<4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %24 = call <8 x double> @llvm.sin.v8f64(<8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %25 = call <16 x double> @llvm.sin.v16f64(<16 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %26 = call <vscale x 1 x double> @llvm.sin.nxv1f64(<vscale x 1 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %27 = call <vscale x 2 x double> @llvm.sin.nxv2f64(<vscale x 2 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %28 = call <vscale x 4 x double> @llvm.sin.nxv4f64(<vscale x 4 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %29 = call <vscale x 8 x double> @llvm.sin.nxv8f64(<vscale x 8 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
+ call bfloat @llvm.sin.bf16(bfloat undef)
+ call <2 x bfloat> @llvm.sin.v2bf16(<2 x bfloat> undef)
+ call <4 x bfloat> @llvm.sin.v4bf16(<4 x bfloat> undef)
+ call <8 x bfloat> @llvm.sin.v8bf16(<8 x bfloat> undef)
+ call <16 x bfloat> @llvm.sin.v16bf16(<16 x bfloat> undef)
+ call <vscale x 1 x bfloat> @llvm.sin.nxv1bf16(<vscale x 1 x bfloat> undef)
+ call <vscale x 2 x bfloat> @llvm.sin.nxv2bf16(<vscale x 2 x bfloat> undef)
+ call <vscale x 4 x bfloat> @llvm.sin.nxv4bf16(<vscale x 4 x bfloat> undef)
+ call <vscale x 8 x bfloat> @llvm.sin.nxv8bf16(<vscale x 8 x bfloat> undef)
+ call <vscale x 16 x bfloat> @llvm.sin.nxv16bf16(<vscale x 16 x bfloat> undef)
call float @llvm.sin.f32(float undef)
call <2 x float> @llvm.sin.v2f32(<2 x float> undef)
call <4 x float> @llvm.sin.v4f32(<4 x float> undef)
@@ -46,29 +67,86 @@ define void @sin() {
ret void
}
+define void @sin_f16() {
+; ZVFH-LABEL: 'sin_f16'
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.sin.f16(half undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <2 x half> @llvm.sin.v2f16(<2 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %3 = call <4 x half> @llvm.sin.v4f16(<4 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %4 = call <8 x half> @llvm.sin.v8f16(<8 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %5 = call <16 x half> @llvm.sin.v16f16(<16 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.sin.nxv1f16(<vscale x 1 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.sin.nxv2f16(<vscale x 2 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.sin.nxv4f16(<vscale x 4 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.sin.nxv8f16(<vscale x 8 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'sin_f16'
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call half @llvm.sin.f16(half undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x half> @llvm.sin.v2f16(<2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.sin.v4f16(<4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.sin.v8f16(<8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.sin.v16f16(<16 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.sin.nxv1f16(<vscale x 1 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.sin.nxv2f16(<vscale x 2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.sin.nxv4f16(<vscale x 4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.sin.nxv8f16(<vscale x 8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ call half @llvm.sin.f16(half undef)
+ call <2 x half> @llvm.sin.v2f16(<2 x half> undef)
+ call <4 x half> @llvm.sin.v4f16(<4 x half> undef)
+ call <8 x half> @llvm.sin.v8f16(<8 x half> undef)
+ call <16 x half> @llvm.sin.v16f16(<16 x half> undef)
+ call <vscale x 1 x half> @llvm.sin.nxv1f16(<vscale x 1 x half> undef)
+ call <vscale x 2 x half> @llvm.sin.nxv2f16(<vscale x 2 x half> undef)
+ call <vscale x 4 x half> @llvm.sin.nxv4f16(<vscale x 4 x half> undef)
+ call <vscale x 8 x half> @llvm.sin.nxv8f16(<vscale x 8 x half> undef)
+ ret void
+}
+
define void @cos() {
; CHECK-LABEL: 'cos'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call float @llvm.cos.f32(float undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x float> @llvm.cos.v2f32(<2 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x float> @llvm.cos.v4f32(<4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %4 = call <8 x float> @llvm.cos.v8f32(<8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x float> @llvm.cos.v16f32(<16 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x float> @llvm.cos.nxv1f32(<vscale x 1 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x float> @llvm.cos.nxv2f32(<vscale x 2 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x float> @llvm.cos.nxv8f32(<vscale x 8 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x float> @llvm.cos.nxv16f32(<vscale x 16 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call double @llvm.cos.f64(double undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x double> @llvm.cos.v2f64(<2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %13 = call <4 x double> @llvm.cos.v4f64(<4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x double> @llvm.cos.v8f64(<8 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x double> @llvm.cos.v16f64(<16 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x double> @llvm.cos.nxv1f64(<vscale x 1 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x double> @llvm.cos.nxv2f64(<vscale x 2 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x double> @llvm.cos.nxv4f64(<vscale x 4 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x double> @llvm.cos.nxv8f64(<vscale x 8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call bfloat @llvm.cos.bf16(bfloat undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x bfloat> @llvm.cos.v2bf16(<2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.cos.v4bf16(<4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.cos.v8bf16(<8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.cos.v16bf16(<16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x bfloat> @llvm.cos.nxv1bf16(<vscale x 1 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x bfloat> @llvm.cos.nxv2bf16(<vscale x 2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x bfloat> @llvm.cos.nxv4bf16(<vscale x 4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x bfloat> @llvm.cos.nxv8bf16(<vscale x 8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x bfloat> @llvm.cos.nxv16bf16(<vscale x 16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.cos.f32(float undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.cos.v2f32(<2 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.cos.v4f32(<4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x float> @llvm.cos.v8f32(<8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x float> @llvm.cos.v16f32(<16 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x float> @llvm.cos.nxv1f32(<vscale x 1 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x float> @llvm.cos.nxv2f32(<vscale x 2 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x float> @llvm.cos.nxv8f32(<vscale x 8 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %20 = call <vscale x 16 x float> @llvm.cos.nxv16f32(<vscale x 16 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %21 = call double @llvm.cos.f64(double undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %22 = call <2 x double> @llvm.cos.v2f64(<2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %23 = call <4 x double> @llvm.cos.v4f64(<4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %24 = call <8 x double> @llvm.cos.v8f64(<8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %25 = call <16 x double> @llvm.cos.v16f64(<16 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %26 = call <vscale x 1 x double> @llvm.cos.nxv1f64(<vscale x 1 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %27 = call <vscale x 2 x double> @llvm.cos.nxv2f64(<vscale x 2 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %28 = call <vscale x 4 x double> @llvm.cos.nxv4f64(<vscale x 4 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %29 = call <vscale x 8 x double> @llvm.cos.nxv8f64(<vscale x 8 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
+ call bfloat @llvm.cos.bf16(bfloat undef)
+ call <2 x bfloat> @llvm.cos.v2bf16(<2 x bfloat> undef)
+ call <4 x bfloat> @llvm.cos.v4bf16(<4 x bfloat> undef)
+ call <8 x bfloat> @llvm.cos.v8bf16(<8 x bfloat> undef)
+ call <16 x bfloat> @llvm.cos.v16bf16(<16 x bfloat> undef)
+ call <vscale x 1 x bfloat> @llvm.cos.nxv1bf16(<vscale x 1 x bfloat> undef)
+ call <vscale x 2 x bfloat> @llvm.cos.nxv2bf16(<vscale x 2 x bfloat> undef)
+ call <vscale x 4 x bfloat> @llvm.cos.nxv4bf16(<vscale x 4 x bfloat> undef)
+ call <vscale x 8 x bfloat> @llvm.cos.nxv8bf16(<vscale x 8 x bfloat> undef)
+ call <vscale x 16 x bfloat> @llvm.cos.nxv16bf16(<vscale x 16 x bfloat> undef)
call float @llvm.cos.f32(float undef)
call <2 x float> @llvm.cos.v2f32(<2 x float> undef)
call <4 x float> @llvm.cos.v4f32(<4 x float> undef)
@@ -91,29 +169,86 @@ define void @cos() {
ret void
}
+define void @cos_f16() {
+; ZVFH-LABEL: 'cos_f16'
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.cos.f16(half undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <2 x half> @llvm.cos.v2f16(<2 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %3 = call <4 x half> @llvm.cos.v4f16(<4 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %4 = call <8 x half> @llvm.cos.v8f16(<8 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %5 = call <16 x half> @llvm.cos.v16f16(<16 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.cos.nxv1f16(<vscale x 1 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.cos.nxv2f16(<vscale x 2 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.cos.nxv4f16(<vscale x 4 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.cos.nxv8f16(<vscale x 8 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'cos_f16'
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call half @llvm.cos.f16(half undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x half> @llvm.cos.v2f16(<2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.cos.v4f16(<4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.cos.v8f16(<8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.cos.v16f16(<16 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.cos.nxv1f16(<vscale x 1 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.cos.nxv2f16(<vscale x 2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.cos.nxv4f16(<vscale x 4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.cos.nxv8f16(<vscale x 8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ call half @llvm.cos.f16(half undef)
+ call <2 x half> @llvm.cos.v2f16(<2 x half> undef)
+ call <4 x half> @llvm.cos.v4f16(<4 x half> undef)
+ call <8 x half> @llvm.cos.v8f16(<8 x half> undef)
+ call <16 x half> @llvm.cos.v16f16(<16 x half> undef)
+ call <vscale x 1 x half> @llvm.cos.nxv1f16(<vscale x 1 x half> undef)
+ call <vscale x 2 x half> @llvm.cos.nxv2f16(<vscale x 2 x half> undef)
+ call <vscale x 4 x half> @llvm.cos.nxv4f16(<vscale x 4 x half> undef)
+ call <vscale x 8 x half> @llvm.cos.nxv8f16(<vscale x 8 x half> undef)
+ ret void
+}
+
define void @exp() {
; CHECK-LABEL: 'exp'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call float @llvm.exp.f32(float undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x float> @llvm.exp.v2f32(<2 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x float> @llvm.exp.v4f32(<4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %4 = call <8 x float> @llvm.exp.v8f32(<8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x float> @llvm.exp.v16f32(<16 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x float> @llvm.exp.nxv1f32(<vscale x 1 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x float> @llvm.exp.nxv2f32(<vscale x 2 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x float> @llvm.exp.nxv8f32(<vscale x 8 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x float> @llvm.exp.nxv16f32(<vscale x 16 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call double @llvm.exp.f64(double undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x double> @llvm.exp.v2f64(<2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %13 = call <4 x double> @llvm.exp.v4f64(<4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x double> @llvm.exp.v8f64(<8 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x double> @llvm.exp.v16f64(<16 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x double> @llvm.exp.nxv1f64(<vscale x 1 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x double> @llvm.exp.nxv2f64(<vscale x 2 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x double> @llvm.exp.nxv4f64(<vscale x 4 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x double> @llvm.exp.nxv8f64(<vscale x 8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call bfloat @llvm.exp.bf16(bfloat undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x bfloat> @llvm.exp.v2bf16(<2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x bfloat> @llvm.exp.nxv1bf16(<vscale x 1 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x bfloat> @llvm.exp.nxv2bf16(<vscale x 2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x bfloat> @llvm.exp.nxv4bf16(<vscale x 4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x bfloat> @llvm.exp.nxv8bf16(<vscale x 8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x bfloat> @llvm.exp.nxv16bf16(<vscale x 16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.exp.f32(float undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.exp.v2f32(<2 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.exp.v4f32(<4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x float> @llvm.exp.v8f32(<8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x float> @llvm.exp.v16f32(<16 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x float> @llvm.exp.nxv1f32(<vscale x 1 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x float> @llvm.exp.nxv2f32(<vscale x 2 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x float> @llvm.exp.nxv8f32(<vscale x 8 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %20 = call <vscale x 16 x float> @llvm.exp.nxv16f32(<vscale x 16 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %21 = call double @llvm.exp.f64(double undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %22 = call <2 x double> @llvm.exp.v2f64(<2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %23 = call <4 x double> @llvm.exp.v4f64(<4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %24 = call <8 x double> @llvm.exp.v8f64(<8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %25 = call <16 x double> @llvm.exp.v16f64(<16 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %26 = call <vscale x 1 x double> @llvm.exp.nxv1f64(<vscale x 1 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %27 = call <vscale x 2 x double> @llvm.exp.nxv2f64(<vscale x 2 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %28 = call <vscale x 4 x double> @llvm.exp.nxv4f64(<vscale x 4 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %29 = call <vscale x 8 x double> @llvm.exp.nxv8f64(<vscale x 8 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
+ call bfloat @llvm.exp.bf16(bfloat undef)
+ call <2 x bfloat> @llvm.exp.v2bf16(<2 x bfloat> undef)
+ call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef)
+ call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef)
+ call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef)
+ call <vscale x 1 x bfloat> @llvm.exp.nxv1bf16(<vscale x 1 x bfloat> undef)
+ call <vscale x 2 x bfloat> @llvm.exp.nxv2bf16(<vscale x 2 x bfloat> undef)
+ call <vscale x 4 x bfloat> @llvm.exp.nxv4bf16(<vscale x 4 x bfloat> undef)
+ call <vscale x 8 x bfloat> @llvm.exp.nxv8bf16(<vscale x 8 x bfloat> undef)
+ call <vscale x 16 x bfloat> @llvm.exp.nxv16bf16(<vscale x 16 x bfloat> undef)
call float @llvm.exp.f32(float undef)
call <2 x float> @llvm.exp.v2f32(<2 x float> undef)
call <4 x float> @llvm.exp.v4f32(<4 x float> undef)
@@ -136,29 +271,86 @@ define void @exp() {
ret void
}
+define void @exp_f16() {
+; ZVFH-LABEL: 'exp_f16'
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.exp.f16(half undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <2 x half> @llvm.exp.v2f16(<2 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %3 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %4 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %5 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.exp.nxv1f16(<vscale x 1 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.exp.nxv2f16(<vscale x 2 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.exp.nxv4f16(<vscale x 4 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.exp.nxv8f16(<vscale x 8 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'exp_f16'
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call half @llvm.exp.f16(half undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x half> @llvm.exp.v2f16(<2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.exp.nxv1f16(<vscale x 1 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.exp.nxv2f16(<vscale x 2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.exp.nxv4f16(<vscale x 4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.exp.nxv8f16(<vscale x 8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ call half @llvm.exp.f16(half undef)
+ call <2 x half> @llvm.exp.v2f16(<2 x half> undef)
+ call <4 x half> @llvm.exp.v4f16(<4 x half> undef)
+ call <8 x half> @llvm.exp.v8f16(<8 x half> undef)
+ call <16 x half> @llvm.exp.v16f16(<16 x half> undef)
+ call <vscale x 1 x half> @llvm.exp.nxv1f16(<vscale x 1 x half> undef)
+ call <vscale x 2 x half> @llvm.exp.nxv2f16(<vscale x 2 x half> undef)
+ call <vscale x 4 x half> @llvm.exp.nxv4f16(<vscale x 4 x half> undef)
+ call <vscale x 8 x half> @llvm.exp.nxv8f16(<vscale x 8 x half> undef)
+ ret void
+}
+
define void @exp2() {
; CHECK-LABEL: 'exp2'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call float @llvm.exp2.f32(float undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x float> @llvm.exp2.v2f32(<2 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x float> @llvm.exp2.v4f32(<4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %4 = call <8 x float> @llvm.exp2.v8f32(<8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x float> @llvm.exp2.v16f32(<16 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x float> @llvm.exp2.nxv1f32(<vscale x 1 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x float> @llvm.exp2.nxv2f32(<vscale x 2 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x float> @llvm.exp2.nxv8f32(<vscale x 8 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x float> @llvm.exp2.nxv16f32(<vscale x 16 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call double @llvm.exp2.f64(double undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x double> @llvm.exp2.v2f64(<2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %13 = call <4 x double> @llvm.exp2.v4f64(<4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x double> @llvm.exp2.v8f64(<8 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x double> @llvm.exp2.v16f64(<16 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x double> @llvm.exp2.nxv1f64(<vscale x 1 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x double> @llvm.exp2.nxv2f64(<vscale x 2 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x double> @llvm.exp2.nxv4f64(<vscale x 4 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x double> @llvm.exp2.nxv8f64(<vscale x 8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call bfloat @llvm.exp2.bf16(bfloat undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x bfloat> @llvm.exp2.nxv1bf16(<vscale x 1 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x bfloat> @llvm.exp2.nxv2bf16(<vscale x 2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x bfloat> @llvm.exp2.nxv4bf16(<vscale x 4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x bfloat> @llvm.exp2.nxv8bf16(<vscale x 8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x bfloat> @llvm.exp2.nxv16bf16(<vscale x 16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.exp2.f32(float undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.exp2.v2f32(<2 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.exp2.v4f32(<4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x float> @llvm.exp2.v8f32(<8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x float> @llvm.exp2.v16f32(<16 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x float> @llvm.exp2.nxv1f32(<vscale x 1 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x float> @llvm.exp2.nxv2f32(<vscale x 2 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x float> @llvm.exp2.nxv8f32(<vscale x 8 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %20 = call <vscale x 16 x float> @llvm.exp2.nxv16f32(<vscale x 16 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %21 = call double @llvm.exp2.f64(double undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %22 = call <2 x double> @llvm.exp2.v2f64(<2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %23 = call <4 x double> @llvm.exp2.v4f64(<4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %24 = call <8 x double> @llvm.exp2.v8f64(<8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %25 = call <16 x double> @llvm.exp2.v16f64(<16 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %26 = call <vscale x 1 x double> @llvm.exp2.nxv1f64(<vscale x 1 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %27 = call <vscale x 2 x double> @llvm.exp2.nxv2f64(<vscale x 2 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %28 = call <vscale x 4 x double> @llvm.exp2.nxv4f64(<vscale x 4 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %29 = call <vscale x 8 x double> @llvm.exp2.nxv8f64(<vscale x 8 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
+ call bfloat @llvm.exp2.bf16(bfloat undef)
+ call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> undef)
+ call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef)
+ call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef)
+ call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef)
+ call <vscale x 1 x bfloat> @llvm.exp2.nxv1bf16(<vscale x 1 x bfloat> undef)
+ call <vscale x 2 x bfloat> @llvm.exp2.nxv2bf16(<vscale x 2 x bfloat> undef)
+ call <vscale x 4 x bfloat> @llvm.exp2.nxv4bf16(<vscale x 4 x bfloat> undef)
+ call <vscale x 8 x bfloat> @llvm.exp2.nxv8bf16(<vscale x 8 x bfloat> undef)
+ call <vscale x 16 x bfloat> @llvm.exp2.nxv16bf16(<vscale x 16 x bfloat> undef)
call float @llvm.exp2.f32(float undef)
call <2 x float> @llvm.exp2.v2f32(<2 x float> undef)
call <4 x float> @llvm.exp2.v4f32(<4 x float> undef)
@@ -181,29 +373,86 @@ define void @exp2() {
ret void
}
+define void @exp2_f16() {
+; ZVFH-LABEL: 'exp2_f16'
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.exp2.f16(half undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <2 x half> @llvm.exp2.v2f16(<2 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %3 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %4 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %5 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.exp2.nxv1f16(<vscale x 1 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.exp2.nxv2f16(<vscale x 2 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.exp2.nxv4f16(<vscale x 4 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.exp2.nxv8f16(<vscale x 8 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'exp2_f16'
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call half @llvm.exp2.f16(half undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x half> @llvm.exp2.v2f16(<2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.exp2.nxv1f16(<vscale x 1 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.exp2.nxv2f16(<vscale x 2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.exp2.nxv4f16(<vscale x 4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.exp2.nxv8f16(<vscale x 8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ call half @llvm.exp2.f16(half undef)
+ call <2 x half> @llvm.exp2.v2f16(<2 x half> undef)
+ call <4 x half> @llvm.exp2.v4f16(<4 x half> undef)
+ call <8 x half> @llvm.exp2.v8f16(<8 x half> undef)
+ call <16 x half> @llvm.exp2.v16f16(<16 x half> undef)
+ call <vscale x 1 x half> @llvm.exp2.nxv1f16(<vscale x 1 x half> undef)
+ call <vscale x 2 x half> @llvm.exp2.nxv2f16(<vscale x 2 x half> undef)
+ call <vscale x 4 x half> @llvm.exp2.nxv4f16(<vscale x 4 x half> undef)
+ call <vscale x 8 x half> @llvm.exp2.nxv8f16(<vscale x 8 x half> undef)
+ ret void
+}
+
define void @log() {
; CHECK-LABEL: 'log'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call float @llvm.log.f32(float undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x float> @llvm.log.v2f32(<2 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x float> @llvm.log.v4f32(<4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %4 = call <8 x float> @llvm.log.v8f32(<8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x float> @llvm.log.v16f32(<16 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x float> @llvm.log.nxv1f32(<vscale x 1 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x float> @llvm.log.nxv2f32(<vscale x 2 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x float> @llvm.log.nxv8f32(<vscale x 8 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x float> @llvm.log.nxv16f32(<vscale x 16 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call double @llvm.log.f64(double undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x double> @llvm.log.v2f64(<2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %13 = call <4 x double> @llvm.log.v4f64(<4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x double> @llvm.log.v8f64(<8 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x double> @llvm.log.v16f64(<16 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x double> @llvm.log.nxv1f64(<vscale x 1 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x double> @llvm.log.nxv2f64(<vscale x 2 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x double> @llvm.log.nxv4f64(<vscale x 4 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x double> @llvm.log.nxv8f64(<vscale x 8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call bfloat @llvm.log.bf16(bfloat undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x bfloat> @llvm.log.v2bf16(<2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x bfloat> @llvm.log.nxv1bf16(<vscale x 1 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x bfloat> @llvm.log.nxv2bf16(<vscale x 2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x bfloat> @llvm.log.nxv4bf16(<vscale x 4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x bfloat> @llvm.log.nxv8bf16(<vscale x 8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x bfloat> @llvm.log.nxv16bf16(<vscale x 16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.log.f32(float undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.log.v2f32(<2 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.log.v4f32(<4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x float> @llvm.log.v8f32(<8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x float> @llvm.log.v16f32(<16 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x float> @llvm.log.nxv1f32(<vscale x 1 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x float> @llvm.log.nxv2f32(<vscale x 2 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x float> @llvm.log.nxv8f32(<vscale x 8 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %20 = call <vscale x 16 x float> @llvm.log.nxv16f32(<vscale x 16 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %21 = call double @llvm.log.f64(double undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %22 = call <2 x double> @llvm.log.v2f64(<2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %23 = call <4 x double> @llvm.log.v4f64(<4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %24 = call <8 x double> @llvm.log.v8f64(<8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %25 = call <16 x double> @llvm.log.v16f64(<16 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %26 = call <vscale x 1 x double> @llvm.log.nxv1f64(<vscale x 1 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %27 = call <vscale x 2 x double> @llvm.log.nxv2f64(<vscale x 2 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %28 = call <vscale x 4 x double> @llvm.log.nxv4f64(<vscale x 4 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %29 = call <vscale x 8 x double> @llvm.log.nxv8f64(<vscale x 8 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
+ call bfloat @llvm.log.bf16(bfloat undef)
+ call <2 x bfloat> @llvm.log.v2bf16(<2 x bfloat> undef)
+ call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef)
+ call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef)
+ call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef)
+ call <vscale x 1 x bfloat> @llvm.log.nxv1bf16(<vscale x 1 x bfloat> undef)
+ call <vscale x 2 x bfloat> @llvm.log.nxv2bf16(<vscale x 2 x bfloat> undef)
+ call <vscale x 4 x bfloat> @llvm.log.nxv4bf16(<vscale x 4 x bfloat> undef)
+ call <vscale x 8 x bfloat> @llvm.log.nxv8bf16(<vscale x 8 x bfloat> undef)
+ call <vscale x 16 x bfloat> @llvm.log.nxv16bf16(<vscale x 16 x bfloat> undef)
call float @llvm.log.f32(float undef)
call <2 x float> @llvm.log.v2f32(<2 x float> undef)
call <4 x float> @llvm.log.v4f32(<4 x float> undef)
@@ -226,29 +475,86 @@ define void @log() {
ret void
}
+define void @log_f16() {
+; ZVFH-LABEL: 'log_f16'
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.log.f16(half undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <2 x half> @llvm.log.v2f16(<2 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %3 = call <4 x half> @llvm.log.v4f16(<4 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %4 = call <8 x half> @llvm.log.v8f16(<8 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %5 = call <16 x half> @llvm.log.v16f16(<16 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.log.nxv1f16(<vscale x 1 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.log.nxv2f16(<vscale x 2 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.log.nxv4f16(<vscale x 4 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.log.nxv8f16(<vscale x 8 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'log_f16'
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call half @llvm.log.f16(half undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x half> @llvm.log.v2f16(<2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.log.v4f16(<4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.log.v8f16(<8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.log.v16f16(<16 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.log.nxv1f16(<vscale x 1 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.log.nxv2f16(<vscale x 2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.log.nxv4f16(<vscale x 4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.log.nxv8f16(<vscale x 8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ call half @llvm.log.f16(half undef)
+ call <2 x half> @llvm.log.v2f16(<2 x half> undef)
+ call <4 x half> @llvm.log.v4f16(<4 x half> undef)
+ call <8 x half> @llvm.log.v8f16(<8 x half> undef)
+ call <16 x half> @llvm.log.v16f16(<16 x half> undef)
+ call <vscale x 1 x half> @llvm.log.nxv1f16(<vscale x 1 x half> undef)
+ call <vscale x 2 x half> @llvm.log.nxv2f16(<vscale x 2 x half> undef)
+ call <vscale x 4 x half> @llvm.log.nxv4f16(<vscale x 4 x half> undef)
+ call <vscale x 8 x half> @llvm.log.nxv8f16(<vscale x 8 x half> undef)
+ ret void
+}
+
define void @log10() {
; CHECK-LABEL: 'log10'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call float @llvm.log10.f32(float undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x float> @llvm.log10.v2f32(<2 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x float> @llvm.log10.v4f32(<4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %4 = call <8 x float> @llvm.log10.v8f32(<8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x float> @llvm.log10.v16f32(<16 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x float> @llvm.log10.nxv1f32(<vscale x 1 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x float> @llvm.log10.nxv2f32(<vscale x 2 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x float> @llvm.log10.nxv4f32(<vscale x 4 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x float> @llvm.log10.nxv8f32(<vscale x 8 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x float> @llvm.log10.nxv16f32(<vscale x 16 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call double @llvm.log10.f64(double undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x double> @llvm.log10.v2f64(<2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %13 = call <4 x double> @llvm.log10.v4f64(<4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x double> @llvm.log10.v8f64(<8 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x double> @llvm.log10.v16f64(<16 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x double> @llvm.log10.nxv1f64(<vscale x 1 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x double> @llvm.log10.nxv2f64(<vscale x 2 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x double> @llvm.log10.nxv4f64(<vscale x 4 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x double> @llvm.log10.nxv8f64(<vscale x 8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call bfloat @llvm.log10.bf16(bfloat undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x bfloat> @llvm.log10.v2bf16(<2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x bfloat> @llvm.log10.nxv1bf16(<vscale x 1 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x bfloat> @llvm.log10.nxv2bf16(<vscale x 2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x bfloat> @llvm.log10.nxv4bf16(<vscale x 4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x bfloat> @llvm.log10.nxv8bf16(<vscale x 8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x bfloat> @llvm.log10.nxv16bf16(<vscale x 16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.log10.f32(float undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.log10.v2f32(<2 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.log10.v4f32(<4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x float> @llvm.log10.v8f32(<8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x float> @llvm.log10.v16f32(<16 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x float> @llvm.log10.nxv1f32(<vscale x 1 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x float> @llvm.log10.nxv2f32(<vscale x 2 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x float> @llvm.log10.nxv4f32(<vscale x 4 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x float> @llvm.log10.nxv8f32(<vscale x 8 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %20 = call <vscale x 16 x float> @llvm.log10.nxv16f32(<vscale x 16 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %21 = call double @llvm.log10.f64(double undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %22 = call <2 x double> @llvm.log10.v2f64(<2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %23 = call <4 x double> @llvm.log10.v4f64(<4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %24 = call <8 x double> @llvm.log10.v8f64(<8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %25 = call <16 x double> @llvm.log10.v16f64(<16 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %26 = call <vscale x 1 x double> @llvm.log10.nxv1f64(<vscale x 1 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %27 = call <vscale x 2 x double> @llvm.log10.nxv2f64(<vscale x 2 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %28 = call <vscale x 4 x double> @llvm.log10.nxv4f64(<vscale x 4 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %29 = call <vscale x 8 x double> @llvm.log10.nxv8f64(<vscale x 8 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
+ call bfloat @llvm.log10.bf16(bfloat undef)
+ call <2 x bfloat> @llvm.log10.v2bf16(<2 x bfloat> undef)
+ call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef)
+ call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef)
+ call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef)
+ call <vscale x 1 x bfloat> @llvm.log10.nxv1bf16(<vscale x 1 x bfloat> undef)
+ call <vscale x 2 x bfloat> @llvm.log10.nxv2bf16(<vscale x 2 x bfloat> undef)
+ call <vscale x 4 x bfloat> @llvm.log10.nxv4bf16(<vscale x 4 x bfloat> undef)
+ call <vscale x 8 x bfloat> @llvm.log10.nxv8bf16(<vscale x 8 x bfloat> undef)
+ call <vscale x 16 x bfloat> @llvm.log10.nxv16bf16(<vscale x 16 x bfloat> undef)
call float @llvm.log10.f32(float undef)
call <2 x float> @llvm.log10.v2f32(<2 x float> undef)
call <4 x float> @llvm.log10.v4f32(<4 x float> undef)
@@ -271,29 +577,86 @@ define void @log10() {
ret void
}
+define void @log10_f16() {
+; ZVFH-LABEL: 'log10_f16'
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.log10.f16(half undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <2 x half> @llvm.log10.v2f16(<2 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %3 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %4 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %5 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.log10.nxv1f16(<vscale x 1 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.log10.nxv2f16(<vscale x 2 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.log10.nxv4f16(<vscale x 4 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.log10.nxv8f16(<vscale x 8 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'log10_f16'
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call half @llvm.log10.f16(half undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x half> @llvm.log10.v2f16(<2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.log10.nxv1f16(<vscale x 1 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.log10.nxv2f16(<vscale x 2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.log10.nxv4f16(<vscale x 4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.log10.nxv8f16(<vscale x 8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ call half @llvm.log10.f16(half undef)
+ call <2 x half> @llvm.log10.v2f16(<2 x half> undef)
+ call <4 x half> @llvm.log10.v4f16(<4 x half> undef)
+ call <8 x half> @llvm.log10.v8f16(<8 x half> undef)
+ call <16 x half> @llvm.log10.v16f16(<16 x half> undef)
+ call <vscale x 1 x half> @llvm.log10.nxv1f16(<vscale x 1 x half> undef)
+ call <vscale x 2 x half> @llvm.log10.nxv2f16(<vscale x 2 x half> undef)
+ call <vscale x 4 x half> @llvm.log10.nxv4f16(<vscale x 4 x half> undef)
+ call <vscale x 8 x half> @llvm.log10.nxv8f16(<vscale x 8 x half> undef)
+ ret void
+}
+
define void @log2() {
; CHECK-LABEL: 'log2'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call float @llvm.log2.f32(float undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x float> @llvm.log2.v2f32(<2 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x float> @llvm.log2.v4f32(<4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %4 = call <8 x float> @llvm.log2.v8f32(<8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x float> @llvm.log2.v16f32(<16 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x float> @llvm.log2.nxv1f32(<vscale x 1 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x float> @llvm.log2.nxv2f32(<vscale x 2 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x float> @llvm.log2.nxv4f32(<vscale x 4 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x float> @llvm.log2.nxv8f32(<vscale x 8 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x float> @llvm.log2.nxv16f32(<vscale x 16 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call double @llvm.log2.f64(double undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x double> @llvm.log2.v2f64(<2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %13 = call <4 x double> @llvm.log2.v4f64(<4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x double> @llvm.log2.v8f64(<8 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x double> @llvm.log2.v16f64(<16 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x double> @llvm.log2.nxv1f64(<vscale x 1 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x double> @llvm.log2.nxv2f64(<vscale x 2 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x double> @llvm.log2.nxv4f64(<vscale x 4 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x double> @llvm.log2.nxv8f64(<vscale x 8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call bfloat @llvm.log2.bf16(bfloat undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x bfloat> @llvm.log2.nxv1bf16(<vscale x 1 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x bfloat> @llvm.log2.nxv2bf16(<vscale x 2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x bfloat> @llvm.log2.nxv4bf16(<vscale x 4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x bfloat> @llvm.log2.nxv8bf16(<vscale x 8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x bfloat> @llvm.log2.nxv16bf16(<vscale x 16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.log2.f32(float undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.log2.v2f32(<2 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.log2.v4f32(<4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x float> @llvm.log2.v8f32(<8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x float> @llvm.log2.v16f32(<16 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x float> @llvm.log2.nxv1f32(<vscale x 1 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x float> @llvm.log2.nxv2f32(<vscale x 2 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x float> @llvm.log2.nxv4f32(<vscale x 4 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x float> @llvm.log2.nxv8f32(<vscale x 8 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %20 = call <vscale x 16 x float> @llvm.log2.nxv16f32(<vscale x 16 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %21 = call double @llvm.log2.f64(double undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %22 = call <2 x double> @llvm.log2.v2f64(<2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %23 = call <4 x double> @llvm.log2.v4f64(<4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %24 = call <8 x double> @llvm.log2.v8f64(<8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %25 = call <16 x double> @llvm.log2.v16f64(<16 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %26 = call <vscale x 1 x double> @llvm.log2.nxv1f64(<vscale x 1 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %27 = call <vscale x 2 x double> @llvm.log2.nxv2f64(<vscale x 2 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %28 = call <vscale x 4 x double> @llvm.log2.nxv4f64(<vscale x 4 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %29 = call <vscale x 8 x double> @llvm.log2.nxv8f64(<vscale x 8 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
+ call bfloat @llvm.log2.bf16(bfloat undef)
+ call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> undef)
+ call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef)
+ call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef)
+ call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef)
+ call <vscale x 1 x bfloat> @llvm.log2.nxv1bf16(<vscale x 1 x bfloat> undef)
+ call <vscale x 2 x bfloat> @llvm.log2.nxv2bf16(<vscale x 2 x bfloat> undef)
+ call <vscale x 4 x bfloat> @llvm.log2.nxv4bf16(<vscale x 4 x bfloat> undef)
+ call <vscale x 8 x bfloat> @llvm.log2.nxv8bf16(<vscale x 8 x bfloat> undef)
+ call <vscale x 16 x bfloat> @llvm.log2.nxv16bf16(<vscale x 16 x bfloat> undef)
call float @llvm.log2.f32(float undef)
call <2 x float> @llvm.log2.v2f32(<2 x float> undef)
call <4 x float> @llvm.log2.v4f32(<4 x float> undef)
@@ -316,142 +679,40 @@ define void @log2() {
ret void
}
-declare float @llvm.sin.f32(float)
-declare <2 x float> @llvm.sin.v2f32(<2 x float>)
-declare <4 x float> @llvm.sin.v4f32(<4 x float>)
-declare <8 x float> @llvm.sin.v8f32(<8 x float>)
-declare <16 x float> @llvm.sin.v16f32(<16 x float>)
-declare <vscale x 1 x float> @llvm.sin.nvx1f32(<vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.sin.nvx2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.sin.nvx4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.sin.nvx8f32(<vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.sin.nvx16f32(<vscale x 16 x float>)
-declare double @llvm.sin.f64(double)
-declare <2 x double> @llvm.sin.v2f64(<2 x double>)
-declare <4 x double> @llvm.sin.v4f64(<4 x double>)
-declare <8 x double> @llvm.sin.v8f64(<8 x double>)
-declare <16 x double> @llvm.sin.v16f64(<16 x double>)
-declare <vscale x 1 x double> @llvm.sin.nvx1f64(<vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.sin.nvx2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.sin.nvx4f64(<vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.sin.nvx8f64(<vscale x 8 x double>)
-
-declare float @llvm.cos.f32(float)
-declare <2 x float> @llvm.cos.v2f32(<2 x float>)
-declare <4 x float> @llvm.cos.v4f32(<4 x float>)
-declare <8 x float> @llvm.cos.v8f32(<8 x float>)
-declare <16 x float> @llvm.cos.v16f32(<16 x float>)
-declare <vscale x 1 x float> @llvm.cos.nvx1f32(<vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.cos.nvx2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.cos.nvx4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.cos.nvx8f32(<vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.cos.nvx16f32(<vscale x 16 x float>)
-declare double @llvm.cos.f64(double)
-declare <2 x double> @llvm.cos.v2f64(<2 x double>)
-declare <4 x double> @llvm.cos.v4f64(<4 x double>)
-declare <8 x double> @llvm.cos.v8f64(<8 x double>)
-declare <16 x double> @llvm.cos.v16f64(<16 x double>)
-declare <vscale x 1 x double> @llvm.cos.nvx1f64(<vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.cos.nvx2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.cos.nvx4f64(<vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.cos.nvx8f64(<vscale x 8 x double>)
-
-declare float @llvm.exp.f32(float)
-declare <2 x float> @llvm.exp.v2f32(<2 x float>)
-declare <4 x float> @llvm.exp.v4f32(<4 x float>)
-declare <8 x float> @llvm.exp.v8f32(<8 x float>)
-declare <16 x float> @llvm.exp.v16f32(<16 x float>)
-declare <vscale x 1 x float> @llvm.exp.nvx1f32(<vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.exp.nvx2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.exp.nvx4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.exp.nvx8f32(<vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.exp.nvx16f32(<vscale x 16 x float>)
-declare double @llvm.exp.f64(double)
-declare <2 x double> @llvm.exp.v2f64(<2 x double>)
-declare <4 x double> @llvm.exp.v4f64(<4 x double>)
-declare <8 x double> @llvm.exp.v8f64(<8 x double>)
-declare <16 x double> @llvm.exp.v16f64(<16 x double>)
-declare <vscale x 1 x double> @llvm.exp.nvx1f64(<vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.exp.nvx2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.exp.nvx4f64(<vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.exp.nvx8f64(<vscale x 8 x double>)
-
-declare float @llvm.exp2.f32(float)
-declare <2 x float> @llvm.exp2.v2f32(<2 x float>)
-declare <4 x float> @llvm.exp2.v4f32(<4 x float>)
-declare <8 x float> @llvm.exp2.v8f32(<8 x float>)
-declare <16 x float> @llvm.exp2.v16f32(<16 x float>)
-declare <vscale x 1 x float> @llvm.exp2.nvx1f32(<vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.exp2.nvx2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.exp2.nvx4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.exp2.nvx8f32(<vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.exp2.nvx16f32(<vscale x 16 x float>)
-declare double @llvm.exp2.f64(double)
-declare <2 x double> @llvm.exp2.v2f64(<2 x double>)
-declare <4 x double> @llvm.exp2.v4f64(<4 x double>)
-declare <8 x double> @llvm.exp2.v8f64(<8 x double>)
-declare <16 x double> @llvm.exp2.v16f64(<16 x double>)
-declare <vscale x 1 x double> @llvm.exp2.nvx1f64(<vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.exp2.nvx2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.exp2.nvx4f64(<vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.exp2.nvx8f64(<vscale x 8 x double>)
-
-declare float @llvm.log.f32(float)
-declare <2 x float> @llvm.log.v2f32(<2 x float>)
-declare <4 x float> @llvm.log.v4f32(<4 x float>)
-declare <8 x float> @llvm.log.v8f32(<8 x float>)
-declare <16 x float> @llvm.log.v16f32(<16 x float>)
-declare <vscale x 1 x float> @llvm.log.nvx1f32(<vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.log.nvx2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.log.nvx4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.log.nvx8f32(<vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.log.nvx16f32(<vscale x 16 x float>)
-declare double @llvm.log.f64(double)
-declare <2 x double> @llvm.log.v2f64(<2 x double>)
-declare <4 x double> @llvm.log.v4f64(<4 x double>)
-declare <8 x double> @llvm.log.v8f64(<8 x double>)
-declare <16 x double> @llvm.log.v16f64(<16 x double>)
-declare <vscale x 1 x double> @llvm.log.nvx1f64(<vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.log.nvx2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.log.nvx4f64(<vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.log.nvx8f64(<vscale x 8 x double>)
-
-declare float @llvm.log10.f32(float)
-declare <2 x float> @llvm.log10.v2f32(<2 x float>)
-declare <4 x float> @llvm.log10.v4f32(<4 x float>)
-declare <8 x float> @llvm.log10.v8f32(<8 x float>)
-declare <16 x float> @llvm.log10.v16f32(<16 x float>)
-declare <vscale x 1 x float> @llvm.log10.nvx1f32(<vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.log10.nvx2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.log10.nvx4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.log10.nvx8f32(<vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.log10.nvx16f32(<vscale x 16 x float>)
-declare double @llvm.log10.f64(double)
-declare <2 x double> @llvm.log10.v2f64(<2 x double>)
-declare <4 x double> @llvm.log10.v4f64(<4 x double>)
-declare <8 x double> @llvm.log10.v8f64(<8 x double>)
-declare <16 x double> @llvm.log10.v16f64(<16 x double>)
-declare <vscale x 1 x double> @llvm.log10.nvx1f64(<vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.log10.nvx2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.log10.nvx4f64(<vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.log10.nvx8f64(<vscale x 8 x double>)
+define void @log2_f16() {
+; ZVFH-LABEL: 'log2_f16'
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.log2.f16(half undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <2 x half> @llvm.log2.v2f16(<2 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %3 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %4 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %5 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.log2.nxv1f16(<vscale x 1 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.log2.nxv2f16(<vscale x 2 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.log2.nxv4f16(<vscale x 4 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.log2.nxv8f16(<vscale x 8 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'log2_f16'
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call half @llvm.log2.f16(half undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x half> @llvm.log2.v2f16(<2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.log2.nxv1f16(<vscale x 1 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.log2.nxv2f16(<vscale x 2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.log2.nxv4f16(<vscale x 4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.log2.nxv8f16(<vscale x 8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ call half @llvm.log2.f16(half undef)
+ call <2 x half> @llvm.log2.v2f16(<2 x half> undef)
+ call <4 x half> @llvm.log2.v4f16(<4 x half> undef)
+ call <8 x half> @llvm.log2.v8f16(<8 x half> undef)
+ call <16 x half> @llvm.log2.v16f16(<16 x half> undef)
+ call <vscale x 1 x half> @llvm.log2.nxv1f16(<vscale x 1 x half> undef)
+ call <vscale x 2 x half> @llvm.log2.nxv2f16(<vscale x 2 x half> undef)
+ call <vscale x 4 x half> @llvm.log2.nxv4f16(<vscale x 4 x half> undef)
+ call <vscale x 8 x half> @llvm.log2.nxv8f16(<vscale x 8 x half> undef)
+ ret void
+}
-declare float @llvm.log2.f32(float)
-declare <2 x float> @llvm.log2.v2f32(<2 x float>)
-declare <4 x float> @llvm.log2.v4f32(<4 x float>)
-declare <8 x float> @llvm.log2.v8f32(<8 x float>)
-declare <16 x float> @llvm.log2.v16f32(<16 x float>)
-declare <vscale x 1 x float> @llvm.log2.nvx1f32(<vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.log2.nvx2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.log2.nvx4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.log2.nvx8f32(<vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.log2.nvx16f32(<vscale x 16 x float>)
-declare double @llvm.log2.f64(double)
-declare <2 x double> @llvm.log2.v2f64(<2 x double>)
-declare <4 x double> @llvm.log2.v4f64(<4 x double>)
-declare <8 x double> @llvm.log2.v8f64(<8 x double>)
-declare <16 x double> @llvm.log2.v16f64(<16 x double>)
-declare <vscale x 1 x double> @llvm.log2.nvx1f64(<vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.log2.nvx2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.log2.nvx4f64(<vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.log2.nvx8f64(<vscale x 8 x double>)
diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-select.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-select.ll
index 2bf1e5d..ef17d8d 100644
--- a/llvm/test/Analysis/CostModel/RISCV/rvv-select.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-select.ll
@@ -414,4 +414,184 @@ define void @select_of_constants() {
ret void
}
+define void @vp_merge() {
+; CHECK-LABEL: 'vp_merge'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %1 = call <1 x i1> @llvm.vp.merge.v1i1(<1 x i1> undef, <1 x i1> undef, <1 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %2 = call <2 x i1> @llvm.vp.merge.v2i1(<2 x i1> undef, <2 x i1> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %3 = call <4 x i1> @llvm.vp.merge.v4i1(<4 x i1> undef, <4 x i1> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %4 = call <8 x i1> @llvm.vp.merge.v8i1(<8 x i1> undef, <8 x i1> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %5 = call <16 x i1> @llvm.vp.merge.v16i1(<16 x i1> undef, <16 x i1> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %6 = call <32 x i1> @llvm.vp.merge.v32i1(<32 x i1> undef, <32 x i1> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %7 = call <vscale x 1 x i1> @llvm.vp.merge.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %8 = call <vscale x 2 x i1> @llvm.vp.merge.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %9 = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %10 = call <vscale x 8 x i1> @llvm.vp.merge.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %11 = call <vscale x 16 x i1> @llvm.vp.merge.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %12 = call <vscale x 32 x i1> @llvm.vp.merge.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = call <1 x i8> @llvm.vp.merge.v1i8(<1 x i1> undef, <1 x i8> undef, <1 x i8> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = call <2 x i8> @llvm.vp.merge.v2i8(<2 x i1> undef, <2 x i8> undef, <2 x i8> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = call <4 x i8> @llvm.vp.merge.v4i8(<4 x i1> undef, <4 x i8> undef, <4 x i8> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = call <8 x i8> @llvm.vp.merge.v8i8(<8 x i1> undef, <8 x i8> undef, <8 x i8> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = call <16 x i8> @llvm.vp.merge.v16i8(<16 x i1> undef, <16 x i8> undef, <16 x i8> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = call <32 x i8> @llvm.vp.merge.v32i8(<32 x i1> undef, <32 x i8> undef, <32 x i8> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = call <vscale x 1 x i8> @llvm.vp.merge.nxv1i8(<vscale x 1 x i1> undef, <vscale x 1 x i8> undef, <vscale x 1 x i8> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = call <vscale x 2 x i8> @llvm.vp.merge.nxv2i8(<vscale x 2 x i1> undef, <vscale x 2 x i8> undef, <vscale x 2 x i8> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = call <vscale x 4 x i8> @llvm.vp.merge.nxv4i8(<vscale x 4 x i1> undef, <vscale x 4 x i8> undef, <vscale x 4 x i8> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = call <vscale x 8 x i8> @llvm.vp.merge.nxv8i8(<vscale x 8 x i1> undef, <vscale x 8 x i8> undef, <vscale x 8 x i8> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %23 = call <vscale x 16 x i8> @llvm.vp.merge.nxv16i8(<vscale x 16 x i1> undef, <vscale x 16 x i8> undef, <vscale x 16 x i8> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %24 = call <vscale x 32 x i8> @llvm.vp.merge.nxv32i8(<vscale x 32 x i1> undef, <vscale x 32 x i8> undef, <vscale x 32 x i8> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %25 = call <1 x i16> @llvm.vp.merge.v1i16(<1 x i1> undef, <1 x i16> undef, <1 x i16> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %26 = call <2 x i16> @llvm.vp.merge.v2i16(<2 x i1> undef, <2 x i16> undef, <2 x i16> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = call <4 x i16> @llvm.vp.merge.v4i16(<4 x i1> undef, <4 x i16> undef, <4 x i16> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %28 = call <8 x i16> @llvm.vp.merge.v8i16(<8 x i1> undef, <8 x i16> undef, <8 x i16> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %29 = call <16 x i16> @llvm.vp.merge.v16i16(<16 x i1> undef, <16 x i16> undef, <16 x i16> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %30 = call <32 x i16> @llvm.vp.merge.v32i16(<32 x i1> undef, <32 x i16> undef, <32 x i16> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %31 = call <vscale x 1 x i16> @llvm.vp.merge.nxv1i16(<vscale x 1 x i1> undef, <vscale x 1 x i16> undef, <vscale x 1 x i16> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %32 = call <vscale x 2 x i16> @llvm.vp.merge.nxv2i16(<vscale x 2 x i1> undef, <vscale x 2 x i16> undef, <vscale x 2 x i16> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %33 = call <vscale x 4 x i16> @llvm.vp.merge.nxv4i16(<vscale x 4 x i1> undef, <vscale x 4 x i16> undef, <vscale x 4 x i16> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %34 = call <vscale x 8 x i16> @llvm.vp.merge.nxv8i16(<vscale x 8 x i1> undef, <vscale x 8 x i16> undef, <vscale x 8 x i16> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %35 = call <vscale x 16 x i16> @llvm.vp.merge.nxv16i16(<vscale x 16 x i1> undef, <vscale x 16 x i16> undef, <vscale x 16 x i16> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %36 = call <vscale x 32 x i16> @llvm.vp.merge.nxv32i16(<vscale x 32 x i1> undef, <vscale x 32 x i16> undef, <vscale x 32 x i16> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %37 = call <1 x i32> @llvm.vp.merge.v1i32(<1 x i1> undef, <1 x i32> undef, <1 x i32> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %38 = call <2 x i32> @llvm.vp.merge.v2i32(<2 x i1> undef, <2 x i32> undef, <2 x i32> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %39 = call <4 x i32> @llvm.vp.merge.v4i32(<4 x i1> undef, <4 x i32> undef, <4 x i32> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %40 = call <8 x i32> @llvm.vp.merge.v8i32(<8 x i1> undef, <8 x i32> undef, <8 x i32> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %41 = call <16 x i32> @llvm.vp.merge.v16i32(<16 x i1> undef, <16 x i32> undef, <16 x i32> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %42 = call <32 x i32> @llvm.vp.merge.v32i32(<32 x i1> undef, <32 x i32> undef, <32 x i32> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %43 = call <vscale x 1 x i32> @llvm.vp.merge.nxv1i32(<vscale x 1 x i1> undef, <vscale x 1 x i32> undef, <vscale x 1 x i32> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %44 = call <vscale x 2 x i32> @llvm.vp.merge.nxv2i32(<vscale x 2 x i1> undef, <vscale x 2 x i32> undef, <vscale x 2 x i32> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %45 = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> undef, <vscale x 4 x i32> undef, <vscale x 4 x i32> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %46 = call <vscale x 8 x i32> @llvm.vp.merge.nxv8i32(<vscale x 8 x i1> undef, <vscale x 8 x i32> undef, <vscale x 8 x i32> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %47 = call <vscale x 16 x i32> @llvm.vp.merge.nxv16i32(<vscale x 16 x i1> undef, <vscale x 16 x i32> undef, <vscale x 16 x i32> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %48 = call <vscale x 32 x i32> @llvm.vp.merge.nxv32i32(<vscale x 32 x i1> undef, <vscale x 32 x i32> undef, <vscale x 32 x i32> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %49 = call <1 x i64> @llvm.vp.merge.v1i64(<1 x i1> undef, <1 x i64> undef, <1 x i64> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %50 = call <2 x i64> @llvm.vp.merge.v2i64(<2 x i1> undef, <2 x i64> undef, <2 x i64> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %51 = call <4 x i64> @llvm.vp.merge.v4i64(<4 x i1> undef, <4 x i64> undef, <4 x i64> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %52 = call <8 x i64> @llvm.vp.merge.v8i64(<8 x i1> undef, <8 x i64> undef, <8 x i64> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %53 = call <16 x i64> @llvm.vp.merge.v16i64(<16 x i1> undef, <16 x i64> undef, <16 x i64> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %54 = call <32 x i64> @llvm.vp.merge.v32i64(<32 x i1> undef, <32 x i64> undef, <32 x i64> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %55 = call <vscale x 1 x i64> @llvm.vp.merge.nxv1i64(<vscale x 1 x i1> undef, <vscale x 1 x i64> undef, <vscale x 1 x i64> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %56 = call <vscale x 2 x i64> @llvm.vp.merge.nxv2i64(<vscale x 2 x i1> undef, <vscale x 2 x i64> undef, <vscale x 2 x i64> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %57 = call <vscale x 4 x i64> @llvm.vp.merge.nxv4i64(<vscale x 4 x i1> undef, <vscale x 4 x i64> undef, <vscale x 4 x i64> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %58 = call <vscale x 8 x i64> @llvm.vp.merge.nxv8i64(<vscale x 8 x i1> undef, <vscale x 8 x i64> undef, <vscale x 8 x i64> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %59 = call <vscale x 16 x i64> @llvm.vp.merge.nxv16i64(<vscale x 16 x i1> undef, <vscale x 16 x i64> undef, <vscale x 16 x i64> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %60 = call <vscale x 32 x i64> @llvm.vp.merge.nxv32i64(<vscale x 32 x i1> undef, <vscale x 32 x i64> undef, <vscale x 32 x i64> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %61 = call <1 x float> @llvm.vp.merge.v1f32(<1 x i1> undef, <1 x float> undef, <1 x float> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %62 = call <2 x float> @llvm.vp.merge.v2f32(<2 x i1> undef, <2 x float> undef, <2 x float> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %63 = call <4 x float> @llvm.vp.merge.v4f32(<4 x i1> undef, <4 x float> undef, <4 x float> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %64 = call <8 x float> @llvm.vp.merge.v8f32(<8 x i1> undef, <8 x float> undef, <8 x float> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %65 = call <16 x float> @llvm.vp.merge.v16f32(<16 x i1> undef, <16 x float> undef, <16 x float> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %66 = call <32 x float> @llvm.vp.merge.v32f32(<32 x i1> undef, <32 x float> undef, <32 x float> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %67 = call <vscale x 1 x float> @llvm.vp.merge.nxv1f32(<vscale x 1 x i1> undef, <vscale x 1 x float> undef, <vscale x 1 x float> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %68 = call <vscale x 2 x float> @llvm.vp.merge.nxv2f32(<vscale x 2 x i1> undef, <vscale x 2 x float> undef, <vscale x 2 x float> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %69 = call <vscale x 4 x float> @llvm.vp.merge.nxv4f32(<vscale x 4 x i1> undef, <vscale x 4 x float> undef, <vscale x 4 x float> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %70 = call <vscale x 8 x float> @llvm.vp.merge.nxv8f32(<vscale x 8 x i1> undef, <vscale x 8 x float> undef, <vscale x 8 x float> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %71 = call <vscale x 16 x float> @llvm.vp.merge.nxv16f32(<vscale x 16 x i1> undef, <vscale x 16 x float> undef, <vscale x 16 x float> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %72 = call <vscale x 32 x float> @llvm.vp.merge.nxv32f32(<vscale x 32 x i1> undef, <vscale x 32 x float> undef, <vscale x 32 x float> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %73 = call <1 x double> @llvm.vp.merge.v1f64(<1 x i1> undef, <1 x double> undef, <1 x double> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %74 = call <2 x double> @llvm.vp.merge.v2f64(<2 x i1> undef, <2 x double> undef, <2 x double> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %75 = call <4 x double> @llvm.vp.merge.v4f64(<4 x i1> undef, <4 x double> undef, <4 x double> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %76 = call <8 x double> @llvm.vp.merge.v8f64(<8 x i1> undef, <8 x double> undef, <8 x double> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %77 = call <16 x double> @llvm.vp.merge.v16f64(<16 x i1> undef, <16 x double> undef, <16 x double> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %78 = call <32 x double> @llvm.vp.merge.v32f64(<32 x i1> undef, <32 x double> undef, <32 x double> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %79 = call <vscale x 1 x double> @llvm.vp.merge.nxv1f64(<vscale x 1 x i1> undef, <vscale x 1 x double> undef, <vscale x 1 x double> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %80 = call <vscale x 2 x double> @llvm.vp.merge.nxv2f64(<vscale x 2 x i1> undef, <vscale x 2 x double> undef, <vscale x 2 x double> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %81 = call <vscale x 4 x double> @llvm.vp.merge.nxv4f64(<vscale x 4 x i1> undef, <vscale x 4 x double> undef, <vscale x 4 x double> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %82 = call <vscale x 8 x double> @llvm.vp.merge.nxv8f64(<vscale x 8 x i1> undef, <vscale x 8 x double> undef, <vscale x 8 x double> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %83 = call <vscale x 16 x double> @llvm.vp.merge.nxv16f64(<vscale x 16 x i1> undef, <vscale x 16 x double> undef, <vscale x 16 x double> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %84 = call <vscale x 32 x double> @llvm.vp.merge.nxv32f64(<vscale x 32 x i1> undef, <vscale x 32 x double> undef, <vscale x 32 x double> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ call <1 x i1> @llvm.vp.merge.v1i1(<1 x i1> undef, <1 x i1> undef, <1 x i1> undef, i32 undef)
+ call <2 x i1> @llvm.vp.merge.v2i1(<2 x i1> undef, <2 x i1> undef, <2 x i1> undef, i32 undef)
+ call <4 x i1> @llvm.vp.merge.v4i1(<4 x i1> undef, <4 x i1> undef, <4 x i1> undef, i32 undef)
+ call <8 x i1> @llvm.vp.merge.v8i1(<8 x i1> undef, <8 x i1> undef, <8 x i1> undef, i32 undef)
+ call <16 x i1> @llvm.vp.merge.v16i1(<16 x i1> undef, <16 x i1> undef, <16 x i1> undef, i32 undef)
+ call <32 x i1> @llvm.vp.merge.v32i1(<32 x i1> undef, <32 x i1> undef, <32 x i1> undef, i32 undef)
+ call <vscale x 1 x i1> @llvm.vp.merge.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+ call <vscale x 2 x i1> @llvm.vp.merge.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+ call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+ call <vscale x 8 x i1> @llvm.vp.merge.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+ call <vscale x 16 x i1> @llvm.vp.merge.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+ call <vscale x 32 x i1> @llvm.vp.merge.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+
+ call <1 x i8> @llvm.vp.merge.v1i8(<1 x i1> undef, <1 x i8> undef, <1 x i8> undef, i32 undef)
+ call <2 x i8> @llvm.vp.merge.v2i8(<2 x i1> undef, <2 x i8> undef, <2 x i8> undef, i32 undef)
+ call <4 x i8> @llvm.vp.merge.v4i8(<4 x i1> undef, <4 x i8> undef, <4 x i8> undef, i32 undef)
+ call <8 x i8> @llvm.vp.merge.v8i8(<8 x i1> undef, <8 x i8> undef, <8 x i8> undef, i32 undef)
+ call <16 x i8> @llvm.vp.merge.v16i8(<16 x i1> undef, <16 x i8> undef, <16 x i8> undef, i32 undef)
+ call <32 x i8> @llvm.vp.merge.v32i8(<32 x i1> undef, <32 x i8> undef, <32 x i8> undef, i32 undef)
+ call <vscale x 1 x i8> @llvm.vp.merge.nxv1i8(<vscale x 1 x i1> undef, <vscale x 1 x i8> undef, <vscale x 1 x i8> undef, i32 undef)
+ call <vscale x 2 x i8> @llvm.vp.merge.nxv2i8(<vscale x 2 x i1> undef, <vscale x 2 x i8> undef, <vscale x 2 x i8> undef, i32 undef)
+ call <vscale x 4 x i8> @llvm.vp.merge.nxv4i8(<vscale x 4 x i1> undef, <vscale x 4 x i8> undef, <vscale x 4 x i8> undef, i32 undef)
+ call <vscale x 8 x i8> @llvm.vp.merge.nxv8i8(<vscale x 8 x i1> undef, <vscale x 8 x i8> undef, <vscale x 8 x i8> undef, i32 undef)
+ call <vscale x 16 x i8> @llvm.vp.merge.nxv16i8(<vscale x 16 x i1> undef, <vscale x 16 x i8> undef, <vscale x 16 x i8> undef, i32 undef)
+ call <vscale x 32 x i8> @llvm.vp.merge.nxv32i8(<vscale x 32 x i1> undef, <vscale x 32 x i8> undef, <vscale x 32 x i8> undef, i32 undef)
+
+ call <1 x i16> @llvm.vp.merge.v1i16(<1 x i1> undef, <1 x i16> undef, <1 x i16> undef, i32 undef)
+ call <2 x i16> @llvm.vp.merge.v2i16(<2 x i1> undef, <2 x i16> undef, <2 x i16> undef, i32 undef)
+ call <4 x i16> @llvm.vp.merge.v4i16(<4 x i1> undef, <4 x i16> undef, <4 x i16> undef, i32 undef)
+ call <8 x i16> @llvm.vp.merge.v8i16(<8 x i1> undef, <8 x i16> undef, <8 x i16> undef, i32 undef)
+ call <16 x i16> @llvm.vp.merge.v16i16(<16 x i1> undef, <16 x i16> undef, <16 x i16> undef, i32 undef)
+ call <32 x i16> @llvm.vp.merge.v32i16(<32 x i1> undef, <32 x i16> undef, <32 x i16> undef, i32 undef)
+ call <vscale x 1 x i16> @llvm.vp.merge.nxv1i16(<vscale x 1 x i1> undef, <vscale x 1 x i16> undef, <vscale x 1 x i16> undef, i32 undef)
+ call <vscale x 2 x i16> @llvm.vp.merge.nxv2i16(<vscale x 2 x i1> undef, <vscale x 2 x i16> undef, <vscale x 2 x i16> undef, i32 undef)
+ call <vscale x 4 x i16> @llvm.vp.merge.nxv4i16(<vscale x 4 x i1> undef, <vscale x 4 x i16> undef, <vscale x 4 x i16> undef, i32 undef)
+ call <vscale x 8 x i16> @llvm.vp.merge.nxv8i16(<vscale x 8 x i1> undef, <vscale x 8 x i16> undef, <vscale x 8 x i16> undef, i32 undef)
+ call <vscale x 16 x i16> @llvm.vp.merge.nxv16i16(<vscale x 16 x i1> undef, <vscale x 16 x i16> undef, <vscale x 16 x i16> undef, i32 undef)
+ call <vscale x 32 x i16> @llvm.vp.merge.nxv32i16(<vscale x 32 x i1> undef, <vscale x 32 x i16> undef, <vscale x 32 x i16> undef, i32 undef)
+
+ call <1 x i32> @llvm.vp.merge.v1i32(<1 x i1> undef, <1 x i32> undef, <1 x i32> undef, i32 undef)
+ call <2 x i32> @llvm.vp.merge.v2i32(<2 x i1> undef, <2 x i32> undef, <2 x i32> undef, i32 undef)
+ call <4 x i32> @llvm.vp.merge.v4i32(<4 x i1> undef, <4 x i32> undef, <4 x i32> undef, i32 undef)
+ call <8 x i32> @llvm.vp.merge.v8i32(<8 x i1> undef, <8 x i32> undef, <8 x i32> undef, i32 undef)
+ call <16 x i32> @llvm.vp.merge.v16i32(<16 x i1> undef, <16 x i32> undef, <16 x i32> undef, i32 undef)
+ call <32 x i32> @llvm.vp.merge.v32i32(<32 x i1> undef, <32 x i32> undef, <32 x i32> undef, i32 undef)
+ call <vscale x 1 x i32> @llvm.vp.merge.nxv1i32(<vscale x 1 x i1> undef, <vscale x 1 x i32> undef, <vscale x 1 x i32> undef, i32 undef)
+ call <vscale x 2 x i32> @llvm.vp.merge.nxv2i32(<vscale x 2 x i1> undef, <vscale x 2 x i32> undef, <vscale x 2 x i32> undef, i32 undef)
+ call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> undef, <vscale x 4 x i32> undef, <vscale x 4 x i32> undef, i32 undef)
+ call <vscale x 8 x i32> @llvm.vp.merge.nxv8i32(<vscale x 8 x i1> undef, <vscale x 8 x i32> undef, <vscale x 8 x i32> undef, i32 undef)
+ call <vscale x 16 x i32> @llvm.vp.merge.nxv16i32(<vscale x 16 x i1> undef, <vscale x 16 x i32> undef, <vscale x 16 x i32> undef, i32 undef)
+ call <vscale x 32 x i32> @llvm.vp.merge.nxv32i32(<vscale x 32 x i1> undef, <vscale x 32 x i32> undef, <vscale x 32 x i32> undef, i32 undef)
+ call <1 x i64> @llvm.vp.merge.v1i64(<1 x i1> undef, <1 x i64> undef, <1 x i64> undef, i32 undef)
+ call <2 x i64> @llvm.vp.merge.v2i64(<2 x i1> undef, <2 x i64> undef, <2 x i64> undef, i32 undef)
+ call <4 x i64> @llvm.vp.merge.v4i64(<4 x i1> undef, <4 x i64> undef, <4 x i64> undef, i32 undef)
+ call <8 x i64> @llvm.vp.merge.v8i64(<8 x i1> undef, <8 x i64> undef, <8 x i64> undef, i32 undef)
+ call <16 x i64> @llvm.vp.merge.v16i64(<16 x i1> undef, <16 x i64> undef, <16 x i64> undef, i32 undef)
+ call <32 x i64> @llvm.vp.merge.v32i64(<32 x i1> undef, <32 x i64> undef, <32 x i64> undef, i32 undef)
+ call <vscale x 1 x i64> @llvm.vp.merge.nxv1i64(<vscale x 1 x i1> undef, <vscale x 1 x i64> undef, <vscale x 1 x i64> undef, i32 undef)
+ call <vscale x 2 x i64> @llvm.vp.merge.nxv2i64(<vscale x 2 x i1> undef, <vscale x 2 x i64> undef, <vscale x 2 x i64> undef, i32 undef)
+ call <vscale x 4 x i64> @llvm.vp.merge.nxv4i64(<vscale x 4 x i1> undef, <vscale x 4 x i64> undef, <vscale x 4 x i64> undef, i32 undef)
+ call <vscale x 8 x i64> @llvm.vp.merge.nxv8i64(<vscale x 8 x i1> undef, <vscale x 8 x i64> undef, <vscale x 8 x i64> undef, i32 undef)
+ call <vscale x 16 x i64> @llvm.vp.merge.nxv16i64(<vscale x 16 x i1> undef, <vscale x 16 x i64> undef, <vscale x 16 x i64> undef, i32 undef)
+ call <vscale x 32 x i64> @llvm.vp.merge.nxv32i64(<vscale x 32 x i1> undef, <vscale x 32 x i64> undef, <vscale x 32 x i64> undef, i32 undef)
+
+ call <1 x float> @llvm.vp.merge.v1f32(<1 x i1> undef, <1 x float> undef, <1 x float> undef, i32 undef)
+ call <2 x float> @llvm.vp.merge.v2f32(<2 x i1> undef, <2 x float> undef, <2 x float> undef, i32 undef)
+ call <4 x float> @llvm.vp.merge.v4f32(<4 x i1> undef, <4 x float> undef, <4 x float> undef, i32 undef)
+ call <8 x float> @llvm.vp.merge.v8f32(<8 x i1> undef, <8 x float> undef, <8 x float> undef, i32 undef)
+ call <16 x float> @llvm.vp.merge.v16f32(<16 x i1> undef, <16 x float> undef, <16 x float> undef, i32 undef)
+ call <32 x float> @llvm.vp.merge.v32f32(<32 x i1> undef, <32 x float> undef, <32 x float> undef, i32 undef)
+ call <vscale x 1 x float> @llvm.vp.merge.nxv1f32(<vscale x 1 x i1> undef, <vscale x 1 x float> undef, <vscale x 1 x float> undef, i32 undef)
+ call <vscale x 2 x float> @llvm.vp.merge.nxv2f32(<vscale x 2 x i1> undef, <vscale x 2 x float> undef, <vscale x 2 x float> undef, i32 undef)
+ call <vscale x 4 x float> @llvm.vp.merge.nxv4f32(<vscale x 4 x i1> undef, <vscale x 4 x float> undef, <vscale x 4 x float> undef, i32 undef)
+ call <vscale x 8 x float> @llvm.vp.merge.nxv8f32(<vscale x 8 x i1> undef, <vscale x 8 x float> undef, <vscale x 8 x float> undef, i32 undef)
+ call <vscale x 16 x float> @llvm.vp.merge.nxv16f32(<vscale x 16 x i1> undef, <vscale x 16 x float> undef, <vscale x 16 x float> undef, i32 undef)
+ call <vscale x 32 x float> @llvm.vp.merge.nxv32f32(<vscale x 32 x i1> undef, <vscale x 32 x float> undef, <vscale x 32 x float> undef, i32 undef)
+
+ call <1 x double> @llvm.vp.merge.v1f64(<1 x i1> undef, <1 x double> undef, <1 x double> undef, i32 undef)
+ call <2 x double> @llvm.vp.merge.v2f64(<2 x i1> undef, <2 x double> undef, <2 x double> undef, i32 undef)
+ call <4 x double> @llvm.vp.merge.v4f64(<4 x i1> undef, <4 x double> undef, <4 x double> undef, i32 undef)
+ call <8 x double> @llvm.vp.merge.v8f64(<8 x i1> undef, <8 x double> undef, <8 x double> undef, i32 undef)
+ call <16 x double> @llvm.vp.merge.v16f64(<16 x i1> undef, <16 x double> undef, <16 x double> undef, i32 undef)
+ call <32 x double> @llvm.vp.merge.v32f64(<32 x i1> undef, <32 x double> undef, <32 x double> undef, i32 undef)
+ call <vscale x 1 x double> @llvm.vp.merge.nxv1f64(<vscale x 1 x i1> undef, <vscale x 1 x double> undef, <vscale x 1 x double> undef, i32 undef)
+ call <vscale x 2 x double> @llvm.vp.merge.nxv2f64(<vscale x 2 x i1> undef, <vscale x 2 x double> undef, <vscale x 2 x double> undef, i32 undef)
+ call <vscale x 4 x double> @llvm.vp.merge.nxv4f64(<vscale x 4 x i1> undef, <vscale x 4 x double> undef, <vscale x 4 x double> undef, i32 undef)
+ call <vscale x 8 x double> @llvm.vp.merge.nxv8f64(<vscale x 8 x i1> undef, <vscale x 8 x double> undef, <vscale x 8 x double> undef, i32 undef)
+ call <vscale x 16 x double> @llvm.vp.merge.nxv16f64(<vscale x 16 x i1> undef, <vscale x 16 x double> undef, <vscale x 16 x double> undef, i32 undef)
+ call <vscale x 32 x double> @llvm.vp.merge.nxv32f64(<vscale x 32 x i1> undef, <vscale x 32 x double> undef, <vscale x 32 x double> undef, i32 undef)
+
+ ret void
+}
diff --git a/llvm/test/Analysis/CostModel/RISCV/splice.ll b/llvm/test/Analysis/CostModel/RISCV/splice.ll
index 8d7d157..ddfaa8c 100644
--- a/llvm/test/Analysis/CostModel/RISCV/splice.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/splice.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh | FileCheck %s
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfhmin | FileCheck %s
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh,+zvfbfmin | FileCheck %s
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfhmin,+zvfbfmin | FileCheck %s
; RUN: opt < %s -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh | FileCheck %s --check-prefix=SIZE
; RUN: opt < %s -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfhmin | FileCheck %s --check-prefix=SIZE
@@ -34,6 +34,13 @@ define void @vector_splice() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv16i64 = call <vscale x 16 x i64> @llvm.vector.splice.nxv16i64(<vscale x 16 x i64> zeroinitializer, <vscale x 16 x i64> zeroinitializer, i32 -1)
; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %splice.nxv32i64 = call <vscale x 32 x i64> @llvm.vector.splice.nxv32i64(<vscale x 32 x i64> zeroinitializer, <vscale x 32 x i64> zeroinitializer, i32 -1)
; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %splice.nxv64i64 = call <vscale x 64 x i64> @llvm.vector.splice.nxv64i64(<vscale x 64 x i64> zeroinitializer, <vscale x 64 x i64> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1bf16 = call <vscale x 1 x bfloat> @llvm.vector.splice.nxv1bf16(<vscale x 1 x bfloat> zeroinitializer, <vscale x 1 x bfloat> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv32bf16 = call <vscale x 32 x bfloat> @llvm.vector.splice.nxv32bf16(<vscale x 32 x bfloat> zeroinitializer, <vscale x 32 x bfloat> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv64bf16 = call <vscale x 64 x bfloat> @llvm.vector.splice.nxv64bf16(<vscale x 64 x bfloat> zeroinitializer, <vscale x 64 x bfloat> zeroinitializer, i32 -1)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f16 = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f16 = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4f16 = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
@@ -86,6 +93,13 @@ define void @vector_splice() {
; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv16i64 = call <vscale x 16 x i64> @llvm.vector.splice.nxv16i64(<vscale x 16 x i64> zeroinitializer, <vscale x 16 x i64> zeroinitializer, i32 -1)
; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv32i64 = call <vscale x 32 x i64> @llvm.vector.splice.nxv32i64(<vscale x 32 x i64> zeroinitializer, <vscale x 32 x i64> zeroinitializer, i32 -1)
; SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv64i64 = call <vscale x 64 x i64> @llvm.vector.splice.nxv64i64(<vscale x 64 x i64> zeroinitializer, <vscale x 64 x i64> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %splice.nxv1bf16 = call <vscale x 1 x bfloat> @llvm.vector.splice.nxv1bf16(<vscale x 1 x bfloat> zeroinitializer, <vscale x 1 x bfloat> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %splice.nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %splice.nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %splice.nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %splice.nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %splice.nxv32bf16 = call <vscale x 32 x bfloat> @llvm.vector.splice.nxv32bf16(<vscale x 32 x bfloat> zeroinitializer, <vscale x 32 x bfloat> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %splice.nxv64bf16 = call <vscale x 64 x bfloat> @llvm.vector.splice.nxv64bf16(<vscale x 64 x bfloat> zeroinitializer, <vscale x 64 x bfloat> zeroinitializer, i32 -1)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f16 = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f16 = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4f16 = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
@@ -141,6 +155,14 @@ define void @vector_splice() {
%splice.nxv32i64 = call <vscale x 32 x i64> @llvm.vector.splice.nxv32i64(<vscale x 32 x i64> zeroinitializer, <vscale x 32 x i64> zeroinitializer, i32 -1)
%splice.nxv64i64 = call <vscale x 64 x i64> @llvm.vector.splice.nxv64i64(<vscale x 64 x i64> zeroinitializer, <vscale x 64 x i64> zeroinitializer, i32 -1)
+ %splice.nxv1bf16 = call <vscale x 1 x bfloat> @llvm.vector.splice.nxv1bf16(<vscale x 1 x bfloat> zeroinitializer, <vscale x 1 x bfloat> zeroinitializer, i32 -1)
+ %splice.nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 -1)
+ %splice.nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 -1)
+ %splice.nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 -1)
+ %splice.nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 -1)
+ %splice.nxv32bf16 = call <vscale x 32 x bfloat> @llvm.vector.splice.nxv32bf16(<vscale x 32 x bfloat> zeroinitializer, <vscale x 32 x bfloat> zeroinitializer, i32 -1)
+ %splice.nxv64bf16 = call <vscale x 64 x bfloat> @llvm.vector.splice.nxv64bf16(<vscale x 64 x bfloat> zeroinitializer, <vscale x 64 x bfloat> zeroinitializer, i32 -1)
+
%splice.nxv1f16 = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
%splice.nxv2f16 = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
%splice.nxv4f16 = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
diff --git a/llvm/test/Assembler/fp-intrinsics-attr.ll b/llvm/test/Assembler/fp-intrinsics-attr.ll
index da6507f..5b9a447 100644
--- a/llvm/test/Assembler/fp-intrinsics-attr.ll
+++ b/llvm/test/Assembler/fp-intrinsics-attr.ll
@@ -105,6 +105,11 @@ define void @func(double %a, double %b, double %c, i32 %i) strictfp {
metadata !"round.dynamic",
metadata !"fpexcept.strict")
+ %atan2 = call double @llvm.experimental.constrained.atan2.f64(
+ double %a, double %b,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict")
+
%cosh = call double @llvm.experimental.constrained.cosh.f64(
double %a,
metadata !"round.dynamic",
@@ -291,6 +296,9 @@ declare double @llvm.experimental.constrained.acos.f64(double, metadata, metadat
declare double @llvm.experimental.constrained.atan.f64(double, metadata, metadata)
; CHECK: @llvm.experimental.constrained.atan.f64({{.*}}) #[[ATTR1]]
+declare double @llvm.experimental.constrained.atan2.f64(double, double, metadata, metadata)
+; CHECK: @llvm.experimental.constrained.atan2.f64({{.*}}) #[[ATTR1]]
+
declare double @llvm.experimental.constrained.sinh.f64(double, metadata, metadata)
; CHECK: @llvm.experimental.constrained.sinh.f64({{.*}}) #[[ATTR1]]
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index a21b786..146d117 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -152,12 +152,12 @@
#
# DEBUG-NEXT: G_INTRINSIC_TRUNC (opcode {{[0-9]+}}): 1 type index, 0 imm indices
# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_INTRINSIC_ROUND (opcode {{[0-9]+}}): 1 type index, 0 imm indices
# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_INTRINSIC_LRINT (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
# DEBUG-NEXT: .. the first uncovered type index: 2, OK
# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
@@ -167,8 +167,8 @@
# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_INTRINSIC_ROUNDEVEN (opcode {{[0-9]+}}): 1 type index, 0 imm indices
# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_READCYCLECOUNTER (opcode {{[0-9]+}}): 1 type index, 0 imm indices
# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
@@ -205,34 +205,34 @@
# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: G_ATOMIC_CMPXCHG (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 2, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_ATOMICRMW_XCHG (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 2, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_ATOMICRMW_ADD (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 2, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_ATOMICRMW_SUB (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 2, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_ATOMICRMW_AND (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 2, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_ATOMICRMW_NAND (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
# DEBUG-NEXT: G_ATOMICRMW_OR (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 2, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_ATOMICRMW_XOR (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 2, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_ATOMICRMW_MAX (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
@@ -310,8 +310,8 @@
# DEBUG-NEXT: .. the first uncovered type index: 1, OK
# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_FCONSTANT (opcode {{[0-9]+}}): 1 type index, 0 imm indices
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_VASTART (opcode {{[0-9]+}}): 1 type index, 0 imm indices
# DEBUG-NEXT: .. the first uncovered type index: 1, OK
# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
@@ -459,27 +459,27 @@
# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
# DEBUG-NEXT: G_FADD (opcode {{[0-9]+}}): 1 type index, 0 imm indices
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_FSUB (opcode {{[0-9]+}}): 1 type index, 0 imm indices
# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_FMUL (opcode {{[0-9]+}}): 1 type index, 0 imm indices
# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_FMA (opcode {{[0-9]+}}): 1 type index, 0 imm indices
# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_FMAD (opcode {{[0-9]+}}): 1 type index, 0 imm indices
# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: G_FDIV (opcode {{[0-9]+}}): 1 type index, 0 imm indices
# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_FREM (opcode {{[0-9]+}}): 1 type index, 0 imm indices
# DEBUG-NEXT: .. the first uncovered type index: 1, OK
# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
@@ -565,12 +565,12 @@
# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
# DEBUG-NEXT: G_FMINNUM (opcode {{[0-9]+}}): 1 type index
# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_FMAXNUM (opcode {{[0-9]+}}): 1 type index
# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_FMINNUM_IEEE (opcode {{[0-9]+}}): 1 type index
# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
@@ -579,12 +579,12 @@
# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
# DEBUG-NEXT: G_FMINIMUM (opcode {{[0-9]+}}): 1 type index
# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_FMAXIMUM (opcode {{[0-9]+}}): 1 type index
# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_GET_FPENV (opcode {{[0-9]+}}): 1 type index, 0 imm indices
# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
@@ -692,8 +692,8 @@
# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_FCEIL (opcode {{[0-9]+}}): 1 type index, 0 imm indices
# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_FCOS (opcode {{[0-9]+}}): 1 type index, 0 imm indices
# DEBUG-NEXT: .. the first uncovered type index: 1, OK
# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
@@ -734,20 +734,20 @@
# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_FSQRT (opcode {{[0-9]+}}): 1 type index, 0 imm indices
# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_FFLOOR (opcode {{[0-9]+}}): 1 type index, 0 imm indices
# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_FRINT (opcode {{[0-9]+}}): 1 type index, 0 imm indices
# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_FNEARBYINT (opcode {{[0-9]+}}): 1 type index, 0 imm indices
# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_ADDRSPACE_CAST (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
diff --git a/llvm/test/CodeGen/AArch64/aarch64-scalarize-vec-load-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-scalarize-vec-load-ext.ll
new file mode 100644
index 0000000..30ce0cb
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-scalarize-vec-load-ext.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+
+; FIXME: Currently, we avoid narrowing this v4i32 load in the
+; hope of being able to fold the shift, even though that requires
+; stack storage plus reloads. Ideally, we should narrow here and
+; load the i32 directly from the variable offset, e.g.:
+;
+; add x8, x0, x1, lsl #4
+; and x9, x2, #0x3
+; ldr w0, [x8, x9, lsl #2]
+;
+; The AArch64TargetLowering::shouldReduceLoadWidth heuristic should
+; probably be updated to choose load-narrowing instead of folding the
+; lsl in larger vector cases.
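+;
+; For illustration, a hypothetical sketch of that narrowed form at the
+; IR level (the %base/%idx.clamped/%eltaddr names are made up; this is
+; not what we generate today):
+;
+;   %base = getelementptr inbounds <4 x i32>, ptr %ptr, i64 %off
+;   %idx.clamped = and i32 %ele, 3
+;   %eltaddr = getelementptr inbounds i32, ptr %base, i32 %idx.clamped
+;   %res = load i32, ptr %eltaddr, align 4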
+;
+define i32 @narrow_load_v4_i32_single_ele_variable_idx(ptr %ptr, i64 %off, i32 %ele) {
+; CHECK-LABEL: narrow_load_v4_i32_single_ele_variable_idx:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr q0, [x0, x1, lsl #4]
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2
+; CHECK-NEXT: bfi x8, x2, #2, #2
+; CHECK-NEXT: str q0, [sp]
+; CHECK-NEXT: ldr w0, [x8]
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+entry:
+ %idx = getelementptr inbounds <4 x i32>, ptr %ptr, i64 %off
+ %x = load <4 x i32>, ptr %idx, align 8
+ %res = extractelement <4 x i32> %x, i32 %ele
+ ret i32 %res
+}
diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll
index 4ca2fb8..068e1947 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll
@@ -1,84 +1,121 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s -mtriple aarch64-none-linux-gnu -mattr=+sve -stop-after=finalize-isel | FileCheck %s --check-prefix=CHECK
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple aarch64-none-linux-gnu -mattr=+sve2p1 -stop-after=finalize-isel | FileCheck %s --check-prefix=CHECK
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-none-linux-gnu"
-; Function Attrs: nounwind readnone
-; CHECK: [[ARG1:%[0-9]+]]:zpr = COPY $z1
-; CHECK: [[ARG2:%[0-9]+]]:zpr = COPY $z0
-; CHECK: [[ARG3:%[0-9]+]]:zpr = COPY [[ARG2]]
-; CHECK: [[ARG4:%[0-9]+]]:zpr_3b = COPY [[ARG1]]
-; CHECK: INLINEASM {{.*}} [[ARG4]]
define <vscale x 16 x i8> @test_svadd_i8(<vscale x 16 x i8> %Zn, <vscale x 16 x i8> %Zm) {
+ ; CHECK-LABEL: name: test_svadd_i8
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $z0, $z1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:zpr = COPY $z1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:zpr = COPY $z0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:zpr = COPY [[COPY1]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:zpr_3b = COPY [[COPY]]
+ ; CHECK-NEXT: INLINEASM &"add $0.b, $1.b, $2.b", 0 /* attdialect */, 5046282 /* regdef:ZPR */, def %2, 5046281 /* reguse:ZPR */, [[COPY2]], 5373961 /* reguse:ZPR_3b */, [[COPY3]]
+ ; CHECK-NEXT: $z0 = COPY %2
+ ; CHECK-NEXT: RET_ReallyLR implicit $z0
%1 = tail call <vscale x 16 x i8> asm "add $0.b, $1.b, $2.b", "=w,w,y"(<vscale x 16 x i8> %Zn, <vscale x 16 x i8> %Zm)
ret <vscale x 16 x i8> %1
}
-; Function Attrs: nounwind readnone
-; CHECK: [[ARG1:%[0-9]+]]:zpr = COPY $z1
-; CHECK: [[ARG2:%[0-9]+]]:zpr = COPY $z0
-; CHECK: [[ARG3:%[0-9]+]]:zpr = COPY [[ARG2]]
-; CHECK: [[ARG4:%[0-9]+]]:zpr_4b = COPY [[ARG1]]
-; CHECK: INLINEASM {{.*}} [[ARG4]]
define <vscale x 2 x i64> @test_svsub_i64(<vscale x 2 x i64> %Zn, <vscale x 2 x i64> %Zm) {
+ ; CHECK-LABEL: name: test_svsub_i64
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $z0, $z1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:zpr = COPY $z1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:zpr = COPY $z0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:zpr = COPY [[COPY1]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:zpr_4b = COPY [[COPY]]
+ ; CHECK-NEXT: INLINEASM &"sub $0.d, $1.d, $2.d", 0 /* attdialect */, 5046282 /* regdef:ZPR */, def %2, 5046281 /* reguse:ZPR */, [[COPY2]], 5242889 /* reguse:ZPR_4b */, [[COPY3]]
+ ; CHECK-NEXT: $z0 = COPY %2
+ ; CHECK-NEXT: RET_ReallyLR implicit $z0
%1 = tail call <vscale x 2 x i64> asm "sub $0.d, $1.d, $2.d", "=w,w,x"(<vscale x 2 x i64> %Zn, <vscale x 2 x i64> %Zm)
ret <vscale x 2 x i64> %1
}
-; Function Attrs: nounwind readnone
-; CHECK: [[ARG1:%[0-9]+]]:zpr = COPY $z1
-; CHECK: [[ARG2:%[0-9]+]]:zpr = COPY $z0
-; CHECK: [[ARG3:%[0-9]+]]:zpr = COPY [[ARG2]]
-; CHECK: [[ARG4:%[0-9]+]]:zpr_3b = COPY [[ARG1]]
-; CHECK: INLINEASM {{.*}} [[ARG4]]
define <vscale x 8 x half> @test_svfmul_f16(<vscale x 8 x half> %Zn, <vscale x 8 x half> %Zm) {
+ ; CHECK-LABEL: name: test_svfmul_f16
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $z0, $z1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:zpr = COPY $z1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:zpr = COPY $z0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:zpr = COPY [[COPY1]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:zpr_3b = COPY [[COPY]]
+ ; CHECK-NEXT: INLINEASM &"fmul $0.h, $1.h, $2.h", 0 /* attdialect */, 5046282 /* regdef:ZPR */, def %2, 5046281 /* reguse:ZPR */, [[COPY2]], 5373961 /* reguse:ZPR_3b */, [[COPY3]]
+ ; CHECK-NEXT: $z0 = COPY %2
+ ; CHECK-NEXT: RET_ReallyLR implicit $z0
%1 = tail call <vscale x 8 x half> asm "fmul $0.h, $1.h, $2.h", "=w,w,y"(<vscale x 8 x half> %Zn, <vscale x 8 x half> %Zm)
ret <vscale x 8 x half> %1
}
-; Function Attrs: nounwind readnone
-; CHECK: [[ARG1:%[0-9]+]]:zpr = COPY $z1
-; CHECK: [[ARG2:%[0-9]+]]:zpr = COPY $z0
-; CHECK: [[ARG3:%[0-9]+]]:zpr = COPY [[ARG2]]
-; CHECK: [[ARG4:%[0-9]+]]:zpr_4b = COPY [[ARG1]]
-; CHECK: INLINEASM {{.*}} [[ARG4]]
define <vscale x 4 x float> @test_svfmul_f(<vscale x 4 x float> %Zn, <vscale x 4 x float> %Zm) {
+ ; CHECK-LABEL: name: test_svfmul_f
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $z0, $z1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:zpr = COPY $z1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:zpr = COPY $z0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:zpr = COPY [[COPY1]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:zpr_4b = COPY [[COPY]]
+ ; CHECK-NEXT: INLINEASM &"fmul $0.s, $1.s, $2.s", 0 /* attdialect */, 5046282 /* regdef:ZPR */, def %2, 5046281 /* reguse:ZPR */, [[COPY2]], 5242889 /* reguse:ZPR_4b */, [[COPY3]]
+ ; CHECK-NEXT: $z0 = COPY %2
+ ; CHECK-NEXT: RET_ReallyLR implicit $z0
%1 = tail call <vscale x 4 x float> asm "fmul $0.s, $1.s, $2.s", "=w,w,x"(<vscale x 4 x float> %Zn, <vscale x 4 x float> %Zm)
ret <vscale x 4 x float> %1
}
-; Function Attrs: nounwind readnone
-; CHECK: [[ARG1:%[0-9]+]]:zpr = COPY $z1
-; CHECK: [[ARG2:%[0-9]+]]:zpr = COPY $z0
-; CHECK: [[ARG3:%[0-9]+]]:ppr = COPY $p0
-; CHECK: [[ARG4:%[0-9]+]]:ppr_3b = COPY [[ARG3]]
-; CHECK: INLINEASM {{.*}} [[ARG4]]
define <vscale x 8 x half> @test_svfadd_f16(<vscale x 16 x i1> %Pg, <vscale x 8 x half> %Zn, <vscale x 8 x half> %Zm) {
+ ; CHECK-LABEL: name: test_svfadd_f16
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $p0, $z0, $z1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:zpr = COPY $z1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:zpr = COPY $z0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:ppr = COPY $p0
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:ppr_3b = COPY [[COPY2]]
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:zpr = COPY [[COPY1]]
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:zpr = COPY [[COPY]]
+ ; CHECK-NEXT: INLINEASM &"fadd $0.h, $1/m, $2.h, $3.h", 0 /* attdialect */, 5046282 /* regdef:ZPR */, def %3, 589833 /* reguse:PPR_3b */, [[COPY3]], 5046281 /* reguse:ZPR */, [[COPY4]], 5046281 /* reguse:ZPR */, [[COPY5]]
+ ; CHECK-NEXT: $z0 = COPY %3
+ ; CHECK-NEXT: RET_ReallyLR implicit $z0
%1 = tail call <vscale x 8 x half> asm "fadd $0.h, $1/m, $2.h, $3.h", "=w,@3Upl,w,w"(<vscale x 16 x i1> %Pg, <vscale x 8 x half> %Zn, <vscale x 8 x half> %Zm)
ret <vscale x 8 x half> %1
}
-; Function Attrs: nounwind readnone
-; CHECK: [[ARG1:%[0-9]+]]:zpr = COPY $z0
-; CHECK: [[ARG2:%[0-9]+]]:ppr = COPY $p0
-; CHECK: [[ARG3:%[0-9]+]]:ppr = COPY [[ARG2]]
-; CHECK: [[ARG4:%[0-9]+]]:zpr = COPY [[ARG1]]
-; CHECK: INLINEASM {{.*}} [[ARG3]]
define <vscale x 4 x i32> @test_incp(<vscale x 16 x i1> %Pg, <vscale x 4 x i32> %Zn) {
+ ; CHECK-LABEL: name: test_incp
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $p0, $z0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:zpr = COPY $z0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:ppr = COPY $p0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:ppr = COPY [[COPY1]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:zpr = COPY [[COPY]]
+ ; CHECK-NEXT: INLINEASM &"incp $0.s, $1", 0 /* attdialect */, 5046282 /* regdef:ZPR */, def %2, 393225 /* reguse:PPR */, [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3)
+ ; CHECK-NEXT: $z0 = COPY %2
+ ; CHECK-NEXT: RET_ReallyLR implicit $z0
%1 = tail call <vscale x 4 x i32> asm "incp $0.s, $1", "=w,@3Upa,0"(<vscale x 16 x i1> %Pg, <vscale x 4 x i32> %Zn)
ret <vscale x 4 x i32> %1
}
-; Function Attrs: nounwind readnone
-; CHECK: [[ARG1:%[0-9]+]]:zpr = COPY $z1
-; CHECK: [[ARG2:%[0-9]+]]:zpr = COPY $z0
-; CHECK: [[ARG3:%[0-9]+]]:ppr = COPY $p0
-; CHECK: [[ARG4:%[0-9]+]]:ppr_p8to15 = COPY [[ARG3]]
-; CHECK: INLINEASM {{.*}} [[ARG4]]
define <vscale x 8 x half> @test_svfadd_f16_Uph_constraint(<vscale x 16 x i1> %Pg, <vscale x 8 x half> %Zn, <vscale x 8 x half> %Zm) {
+ ; CHECK-LABEL: name: test_svfadd_f16_Uph_constraint
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $p0, $z0, $z1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:zpr = COPY $z1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:zpr = COPY $z0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:ppr = COPY $p0
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:ppr_p8to15 = COPY [[COPY2]]
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:zpr = COPY [[COPY1]]
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:zpr = COPY [[COPY]]
+ ; CHECK-NEXT: INLINEASM &"fadd $0.h, $1/m, $2.h, $3.h", 0 /* attdialect */, 5046282 /* regdef:ZPR */, def %3, 655369 /* reguse:PPR_p8to15 */, [[COPY3]], 5046281 /* reguse:ZPR */, [[COPY4]], 5046281 /* reguse:ZPR */, [[COPY5]]
+ ; CHECK-NEXT: $z0 = COPY %3
+ ; CHECK-NEXT: RET_ReallyLR implicit $z0
%1 = tail call <vscale x 8 x half> asm "fadd $0.h, $1/m, $2.h, $3.h", "=w,@3Uph,w,w"(<vscale x 16 x i1> %Pg, <vscale x 8 x half> %Zn, <vscale x 8 x half> %Zm)
ret <vscale x 8 x half> %1
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/early-ifcvt-likely-predictable.mir b/llvm/test/CodeGen/AArch64/early-ifcvt-likely-predictable.mir
index 425a232..ab5e320 100644
--- a/llvm/test/CodeGen/AArch64/early-ifcvt-likely-predictable.mir
+++ b/llvm/test/CodeGen/AArch64/early-ifcvt-likely-predictable.mir
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=arm64-apple-ios -mcpu=apple-m1 -run-pass=early-ifcvt -o - %s | FileCheck %s
+# RUN: llc -mtriple=arm64-apple-ios -mcpu=apple-m1 -passes=early-ifcvt -o - %s | FileCheck %s
--- |
define void @test_cond_is_load_with_invariant_ops() {
diff --git a/llvm/test/CodeGen/AArch64/early-ifcvt-regclass-mismatch.mir b/llvm/test/CodeGen/AArch64/early-ifcvt-regclass-mismatch.mir
index 318bdce..a7f67f8 100644
--- a/llvm/test/CodeGen/AArch64/early-ifcvt-regclass-mismatch.mir
+++ b/llvm/test/CodeGen/AArch64/early-ifcvt-regclass-mismatch.mir
@@ -1,4 +1,5 @@
# RUN: llc -mtriple=aarch64-unknown-unknown -run-pass=early-ifcvt -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=aarch64-unknown-unknown -passes=early-ifcvt -verify-each %s -o - | FileCheck %s
--- |
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "arm64-apple-ios13.3.0"
diff --git a/llvm/test/CodeGen/AArch64/early-ifcvt-same-value.mir b/llvm/test/CodeGen/AArch64/early-ifcvt-same-value.mir
index b929860..16d5dfc 100644
--- a/llvm/test/CodeGen/AArch64/early-ifcvt-same-value.mir
+++ b/llvm/test/CodeGen/AArch64/early-ifcvt-same-value.mir
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=aarch64-- -run-pass=early-ifcvt -stress-early-ifcvt -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=aarch64-- -passes=early-ifcvt -stress-early-ifcvt %s -o - | FileCheck %s
---
name: fmov0
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
new file mode 100644
index 0000000..9aadf31
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
@@ -0,0 +1,121 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc < %s | FileCheck %s --check-prefix=NON-STREAMING
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define double @t1(double %x) {
+; CHECK-LABEL: t1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs x8, d0
+; CHECK-NEXT: scvtf d0, x8
+; CHECK-NEXT: ret
+;
+; NON-STREAMING-LABEL: t1:
+; NON-STREAMING: // %bb.0: // %entry
+; NON-STREAMING-NEXT: fcvtzs d0, d0
+; NON-STREAMING-NEXT: scvtf d0, d0
+; NON-STREAMING-NEXT: ret
+entry:
+ %conv = fptosi double %x to i64
+ %conv1 = sitofp i64 %conv to double
+ ret double %conv1
+}
+
+define float @t2(float %x) {
+; CHECK-LABEL: t2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs w8, s0
+; CHECK-NEXT: scvtf s0, w8
+; CHECK-NEXT: ret
+;
+; NON-STREAMING-LABEL: t2:
+; NON-STREAMING: // %bb.0: // %entry
+; NON-STREAMING-NEXT: fcvtzs s0, s0
+; NON-STREAMING-NEXT: scvtf s0, s0
+; NON-STREAMING-NEXT: ret
+entry:
+ %conv = fptosi float %x to i32
+ %conv1 = sitofp i32 %conv to float
+ ret float %conv1
+}
+
+define half @t3(half %x) {
+; CHECK-LABEL: t3:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcvtzs w8, s0
+; CHECK-NEXT: scvtf s0, w8
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+;
+; NON-STREAMING-LABEL: t3:
+; NON-STREAMING: // %bb.0: // %entry
+; NON-STREAMING-NEXT: fcvt s0, h0
+; NON-STREAMING-NEXT: fcvtzs s0, s0
+; NON-STREAMING-NEXT: scvtf s0, s0
+; NON-STREAMING-NEXT: fcvt h0, s0
+; NON-STREAMING-NEXT: ret
+entry:
+ %conv = fptosi half %x to i32
+ %conv1 = sitofp i32 %conv to half
+ ret half %conv1
+}
+
+define double @t4(double %x) {
+; CHECK-LABEL: t4:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu x8, d0
+; CHECK-NEXT: ucvtf d0, x8
+; CHECK-NEXT: ret
+;
+; NON-STREAMING-LABEL: t4:
+; NON-STREAMING: // %bb.0: // %entry
+; NON-STREAMING-NEXT: fcvtzu d0, d0
+; NON-STREAMING-NEXT: ucvtf d0, d0
+; NON-STREAMING-NEXT: ret
+entry:
+ %conv = fptoui double %x to i64
+ %conv1 = uitofp i64 %conv to double
+ ret double %conv1
+}
+
+define float @t5(float %x) {
+; CHECK-LABEL: t5:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu w8, s0
+; CHECK-NEXT: ucvtf s0, w8
+; CHECK-NEXT: ret
+;
+; NON-STREAMING-LABEL: t5:
+; NON-STREAMING: // %bb.0: // %entry
+; NON-STREAMING-NEXT: fcvtzu s0, s0
+; NON-STREAMING-NEXT: ucvtf s0, s0
+; NON-STREAMING-NEXT: ret
+entry:
+ %conv = fptoui float %x to i32
+ %conv1 = uitofp i32 %conv to float
+ ret float %conv1
+}
+
+define half @t6(half %x) {
+; CHECK-LABEL: t6:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcvtzu w8, s0
+; CHECK-NEXT: ucvtf s0, w8
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+;
+; NON-STREAMING-LABEL: t6:
+; NON-STREAMING: // %bb.0: // %entry
+; NON-STREAMING-NEXT: fcvt s0, h0
+; NON-STREAMING-NEXT: fcvtzu s0, s0
+; NON-STREAMING-NEXT: ucvtf s0, s0
+; NON-STREAMING-NEXT: fcvt h0, s0
+; NON-STREAMING-NEXT: ret
+entry:
+ %conv = fptoui half %x to i32
+ %conv1 = uitofp i32 %conv to half
+ ret half %conv1
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
index afd3bb7..0c712a1 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
@@ -21,20 +21,20 @@ define <4 x half> @ucvtf_v4i16_v4f16(<4 x i16> %op1) {
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: str d0, [sp, #-16]!
; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #6]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4]
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: str h0, [sp, #14]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #4]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2]
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: str h0, [sp, #12]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #2]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp]
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: str h0, [sp, #10]
-; NONEON-NOSVE-NEXT: ldr h0, [sp]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: str h0, [sp, #8]
; NONEON-NOSVE-NEXT: ldr d0, [sp, #8]
@@ -58,36 +58,36 @@ define void @ucvtf_v8i16_v8f16(ptr %a, ptr %b) {
; NONEON-NOSVE-NEXT: ldr q0, [x0]
; NONEON-NOSVE-NEXT: str q0, [sp, #-32]!
; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #14]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12]
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: str h0, [sp, #30]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #12]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10]
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: str h0, [sp, #28]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #10]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8]
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: str h0, [sp, #26]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #8]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6]
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: str h0, [sp, #24]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #6]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4]
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: str h0, [sp, #22]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #4]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2]
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: str h0, [sp, #20]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #2]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp]
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: str h0, [sp, #18]
-; NONEON-NOSVE-NEXT: ldr h0, [sp]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: str h0, [sp, #16]
; NONEON-NOSVE-NEXT: ldr q0, [sp, #16]
@@ -115,68 +115,68 @@ define void @ucvtf_v16i16_v16f16(ptr %a, ptr %b) {
; NONEON-NOSVE-NEXT: ldp q1, q0, [x0]
; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]!
; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #30]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28]
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: str h0, [sp, #62]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #28]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26]
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: str h0, [sp, #60]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #26]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24]
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: str h0, [sp, #58]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #24]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22]
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: str h0, [sp, #56]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #22]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20]
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: str h0, [sp, #54]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #20]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18]
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: str h0, [sp, #52]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #18]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16]
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: str h0, [sp, #50]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #16]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14]
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: str h0, [sp, #48]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #14]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12]
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: str h0, [sp, #46]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #12]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10]
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: str h0, [sp, #44]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #10]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8]
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: str h0, [sp, #42]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #8]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6]
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: str h0, [sp, #40]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #6]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4]
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: str h0, [sp, #38]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #4]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2]
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: str h0, [sp, #36]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #2]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp]
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: str h0, [sp, #34]
-; NONEON-NOSVE-NEXT: ldr h0, [sp]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: str h0, [sp, #32]
; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32]
@@ -207,11 +207,11 @@ define <2 x float> @ucvtf_v2i16_v2f32(<2 x i16> %op1) {
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: str d0, [sp, #-16]!
; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #4]
-; NONEON-NOSVE-NEXT: ucvtf s1, s0
-; NONEON-NOSVE-NEXT: ldr h0, [sp]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
-; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT: ldrh w9, [sp]
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ucvtf s1, w9
+; NONEON-NOSVE-NEXT: stp s1, s0, [sp, #8]
; NONEON-NOSVE-NEXT: ldr d0, [sp, #8]
; NONEON-NOSVE-NEXT: add sp, sp, #16
; NONEON-NOSVE-NEXT: ret
@@ -234,15 +234,15 @@ define <4 x float> @ucvtf_v4i16_v4f32(<4 x i16> %op1) {
; NONEON-NOSVE-NEXT: sub sp, sp, #32
; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32
; NONEON-NOSVE-NEXT: str d0, [sp, #8]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #14]
-; NONEON-NOSVE-NEXT: ucvtf s1, s0
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #12]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT: ucvtf s1, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10]
; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #10]
-; NONEON-NOSVE-NEXT: ucvtf s1, s0
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #8]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s1, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16]
; NONEON-NOSVE-NEXT: ldr q0, [sp, #16]
; NONEON-NOSVE-NEXT: add sp, sp, #32
@@ -271,25 +271,25 @@ define void @ucvtf_v8i16_v8f32(ptr %a, ptr %b) {
; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64
; NONEON-NOSVE-NEXT: ldp d1, d0, [sp]
; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #30]
-; NONEON-NOSVE-NEXT: ucvtf s1, s0
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #28]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT: ucvtf s1, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26]
; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #26]
-; NONEON-NOSVE-NEXT: ucvtf s1, s0
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #24]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s1, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22]
; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #22]
-; NONEON-NOSVE-NEXT: ucvtf s1, s0
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #20]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s1, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18]
; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #18]
-; NONEON-NOSVE-NEXT: ucvtf s1, s0
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #16]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s1, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32]
; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32]
; NONEON-NOSVE-NEXT: stp q1, q0, [x1]
@@ -328,47 +328,47 @@ define void @ucvtf_v16i16_v16f32(ptr %a, ptr %b) {
; NONEON-NOSVE-NEXT: ldp d1, d0, [sp]
; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32]
; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #46]
; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #46]
-; NONEON-NOSVE-NEXT: ucvtf s1, s0
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #44]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s1, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #44]
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #42]
; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #88]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #42]
-; NONEON-NOSVE-NEXT: ucvtf s1, s0
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #40]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s1, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #40]
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #38]
; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #80]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #38]
-; NONEON-NOSVE-NEXT: ucvtf s1, s0
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #36]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s1, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #36]
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #34]
; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #72]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #34]
-; NONEON-NOSVE-NEXT: ucvtf s1, s0
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #32]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s1, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #32]
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62]
; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #64]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT: ucvtf s1, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60]
; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #64]
-; NONEON-NOSVE-NEXT: ucvtf s1, s0
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #60]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58]
; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #120]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #58]
-; NONEON-NOSVE-NEXT: ucvtf s1, s0
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #56]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s1, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54]
; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #112]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #54]
-; NONEON-NOSVE-NEXT: ucvtf s1, s0
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #52]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s1, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50]
; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #104]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #50]
-; NONEON-NOSVE-NEXT: ucvtf s1, s0
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #48]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ucvtf s1, w8
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #96]
; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96]
; NONEON-NOSVE-NEXT: stp q2, q3, [x1]
@@ -399,8 +399,8 @@ define <1 x double> @ucvtf_v1i16_v1f64(<1 x i16> %op1) {
; NONEON-NOSVE-NEXT: sub sp, sp, #16
; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16
; NONEON-NOSVE-NEXT: str d0, [sp, #8]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #8]
-; NONEON-NOSVE-NEXT: ucvtf d0, d0
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT: ucvtf d0, w8
; NONEON-NOSVE-NEXT: str d0, [sp]
; NONEON-NOSVE-NEXT: ldr d0, [sp], #16
; NONEON-NOSVE-NEXT: ret
@@ -424,11 +424,11 @@ define <2 x double> @ucvtf_v2i16_v2f64(<2 x i16> %op1) {
; NONEON-NOSVE-NEXT: sub sp, sp, #32
; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32
; NONEON-NOSVE-NEXT: str d0, [sp, #8]
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #12]
-; NONEON-NOSVE-NEXT: ucvtf d1, d0
-; NONEON-NOSVE-NEXT: ldr h0, [sp, #8]
-; NONEON-NOSVE-NEXT: ucvtf d0, d0
-; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT: ucvtf d0, w8
+; NONEON-NOSVE-NEXT: ucvtf d1, w9
+; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #16]
; NONEON-NOSVE-NEXT: ldr q0, [sp, #16]
; NONEON-NOSVE-NEXT: add sp, sp, #32
; NONEON-NOSVE-NEXT: ret
@@ -464,15 +464,13 @@ define void @ucvtf_v4i16_v4f64(ptr %a, ptr %b) {
; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16]
; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16]
; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32]
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #44]
-; NONEON-NOSVE-NEXT: ucvtf d1, d0
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #40]
-; NONEON-NOSVE-NEXT: ucvtf d0, d0
+; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT: ucvtf d1, w9
+; NONEON-NOSVE-NEXT: ucvtf d0, w8
+; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #32]
; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64]
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #36]
-; NONEON-NOSVE-NEXT: ucvtf d1, d0
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #32]
-; NONEON-NOSVE-NEXT: ucvtf d0, d0
+; NONEON-NOSVE-NEXT: ucvtf d1, w9
+; NONEON-NOSVE-NEXT: ucvtf d0, w8
; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48]
; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #48]
; NONEON-NOSVE-NEXT: stp q1, q0, [x1]
@@ -529,27 +527,23 @@ define void @ucvtf_v8i16_v8f64(ptr %a, ptr %b) {
; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32]
; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80]
; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #88]
; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64]
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #92]
-; NONEON-NOSVE-NEXT: ucvtf d1, d0
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #88]
-; NONEON-NOSVE-NEXT: ucvtf d0, d0
+; NONEON-NOSVE-NEXT: ucvtf d1, w9
+; NONEON-NOSVE-NEXT: ucvtf d0, w8
+; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #80]
; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #144]
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #84]
-; NONEON-NOSVE-NEXT: ucvtf d1, d0
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #80]
-; NONEON-NOSVE-NEXT: ucvtf d0, d0
+; NONEON-NOSVE-NEXT: ucvtf d1, w9
+; NONEON-NOSVE-NEXT: ucvtf d0, w8
+; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #72]
; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #128]
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #76]
+; NONEON-NOSVE-NEXT: ucvtf d1, w9
+; NONEON-NOSVE-NEXT: ucvtf d0, w8
+; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #64]
; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #128]
-; NONEON-NOSVE-NEXT: ucvtf d1, d0
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #72]
-; NONEON-NOSVE-NEXT: ucvtf d0, d0
; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #112]
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #68]
-; NONEON-NOSVE-NEXT: ucvtf d1, d0
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #64]
-; NONEON-NOSVE-NEXT: ucvtf d0, d0
+; NONEON-NOSVE-NEXT: ucvtf d1, w9
+; NONEON-NOSVE-NEXT: ucvtf d0, w8
; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96]
; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96]
; NONEON-NOSVE-NEXT: stp q2, q3, [x1]
@@ -649,49 +643,42 @@ define void @ucvtf_v16i16_v16f64(ptr %a, ptr %b) {
; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104]
; NONEON-NOSVE-NEXT: str d1, [sp, #328]
; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #104]
-; NONEON-NOSVE-NEXT: str d0, [sp, #168]
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #164]
+; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #160]
; NONEON-NOSVE-NEXT: stp d1, d2, [sp, #176]
-; NONEON-NOSVE-NEXT: ucvtf d1, d0
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #160]
-; NONEON-NOSVE-NEXT: ucvtf d0, d0
+; NONEON-NOSVE-NEXT: str d0, [sp, #168]
+; NONEON-NOSVE-NEXT: ucvtf d1, w9
+; NONEON-NOSVE-NEXT: ucvtf d0, w8
+; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #152]
; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #240]
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #156]
-; NONEON-NOSVE-NEXT: ucvtf d1, d0
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #152]
-; NONEON-NOSVE-NEXT: ucvtf d0, d0
+; NONEON-NOSVE-NEXT: ucvtf d1, w9
+; NONEON-NOSVE-NEXT: ucvtf d0, w8
+; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #144]
; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #224]
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #148]
-; NONEON-NOSVE-NEXT: ucvtf d1, d0
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #144]
-; NONEON-NOSVE-NEXT: ucvtf d0, d0
+; NONEON-NOSVE-NEXT: ucvtf d1, w9
+; NONEON-NOSVE-NEXT: ucvtf d0, w8
+; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #136]
; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #208]
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #140]
-; NONEON-NOSVE-NEXT: ucvtf d1, d0
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #136]
-; NONEON-NOSVE-NEXT: ucvtf d0, d0
+; NONEON-NOSVE-NEXT: ucvtf d1, w9
+; NONEON-NOSVE-NEXT: ucvtf d0, w8
+; NONEON-NOSVE-NEXT: ldr w8, [sp, #332]
; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #192]
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #332]
+; NONEON-NOSVE-NEXT: ucvtf d1, w8
+; NONEON-NOSVE-NEXT: ldr w8, [sp, #328]
; NONEON-NOSVE-NEXT: ldp q4, q3, [sp, #192]
-; NONEON-NOSVE-NEXT: ucvtf d1, d0
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #328]
-; NONEON-NOSVE-NEXT: ucvtf d0, d0
+; NONEON-NOSVE-NEXT: ucvtf d0, w8
+; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #184]
; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #304]
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #188]
-; NONEON-NOSVE-NEXT: ucvtf d1, d0
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #184]
-; NONEON-NOSVE-NEXT: ucvtf d0, d0
+; NONEON-NOSVE-NEXT: ucvtf d1, w9
+; NONEON-NOSVE-NEXT: ucvtf d0, w8
+; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #176]
; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #288]
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #180]
+; NONEON-NOSVE-NEXT: ucvtf d1, w9
+; NONEON-NOSVE-NEXT: ucvtf d0, w8
+; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #168]
; NONEON-NOSVE-NEXT: ldp q7, q6, [sp, #288]
-; NONEON-NOSVE-NEXT: ucvtf d1, d0
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #176]
-; NONEON-NOSVE-NEXT: ucvtf d0, d0
; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #272]
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #172]
-; NONEON-NOSVE-NEXT: ucvtf d1, d0
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #168]
-; NONEON-NOSVE-NEXT: ucvtf d0, d0
+; NONEON-NOSVE-NEXT: ucvtf d1, w9
+; NONEON-NOSVE-NEXT: ucvtf d0, w8
; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #256]
; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #224]
; NONEON-NOSVE-NEXT: ldp q2, q5, [sp, #256]
@@ -1041,10 +1028,9 @@ define <2 x double> @ucvtf_v2i32_v2f64(<2 x i32> %op1) {
; NONEON-NOSVE-NEXT: sub sp, sp, #32
; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32
; NONEON-NOSVE-NEXT: str d0, [sp, #8]
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #12]
-; NONEON-NOSVE-NEXT: ucvtf d1, d0
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #8]
-; NONEON-NOSVE-NEXT: ucvtf d0, d0
+; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT: ucvtf d1, w9
+; NONEON-NOSVE-NEXT: ucvtf d0, w8
; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16]
; NONEON-NOSVE-NEXT: ldr q0, [sp, #16]
; NONEON-NOSVE-NEXT: add sp, sp, #32
@@ -1073,15 +1059,13 @@ define void @ucvtf_v4i32_v4f64(ptr %a, ptr %b) {
; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64
; NONEON-NOSVE-NEXT: ldp d1, d0, [sp]
; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16]
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #28]
-; NONEON-NOSVE-NEXT: ucvtf d1, d0
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #24]
-; NONEON-NOSVE-NEXT: ucvtf d0, d0
+; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT: ucvtf d1, w9
+; NONEON-NOSVE-NEXT: ucvtf d0, w8
+; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16]
; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48]
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #20]
-; NONEON-NOSVE-NEXT: ucvtf d1, d0
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #16]
-; NONEON-NOSVE-NEXT: ucvtf d0, d0
+; NONEON-NOSVE-NEXT: ucvtf d1, w9
+; NONEON-NOSVE-NEXT: ucvtf d0, w8
; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32]
; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32]
; NONEON-NOSVE-NEXT: stp q1, q0, [x1]
@@ -1120,27 +1104,23 @@ define void @ucvtf_v8i32_v8f64(ptr %a, ptr %b) {
; NONEON-NOSVE-NEXT: ldp d1, d0, [sp]
; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32]
; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #40]
; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48]
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #44]
-; NONEON-NOSVE-NEXT: ucvtf d1, d0
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #40]
-; NONEON-NOSVE-NEXT: ucvtf d0, d0
+; NONEON-NOSVE-NEXT: ucvtf d1, w9
+; NONEON-NOSVE-NEXT: ucvtf d0, w8
+; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #32]
; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80]
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #36]
-; NONEON-NOSVE-NEXT: ucvtf d1, d0
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #32]
-; NONEON-NOSVE-NEXT: ucvtf d0, d0
+; NONEON-NOSVE-NEXT: ucvtf d1, w9
+; NONEON-NOSVE-NEXT: ucvtf d0, w8
+; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #56]
; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64]
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT: ucvtf d1, w9
+; NONEON-NOSVE-NEXT: ucvtf d0, w8
+; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #48]
; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #64]
-; NONEON-NOSVE-NEXT: ucvtf d1, d0
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #56]
-; NONEON-NOSVE-NEXT: ucvtf d0, d0
; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #112]
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #52]
-; NONEON-NOSVE-NEXT: ucvtf d1, d0
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #48]
-; NONEON-NOSVE-NEXT: ucvtf d0, d0
+; NONEON-NOSVE-NEXT: ucvtf d1, w9
+; NONEON-NOSVE-NEXT: ucvtf d0, w8
; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96]
; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96]
; NONEON-NOSVE-NEXT: stp q2, q3, [x1]
@@ -2984,8 +2964,8 @@ define half @ucvtf_i16_f16(ptr %0) {
;
; NONEON-NOSVE-LABEL: ucvtf_i16_f16:
; NONEON-NOSVE: // %bb.0:
-; NONEON-NOSVE-NEXT: ldr h0, [x0]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ldrh w8, [x0]
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: ret
%2 = load i16, ptr %0, align 64
@@ -2996,14 +2976,14 @@ define half @ucvtf_i16_f16(ptr %0) {
define float @ucvtf_i16_f32(ptr %0) {
; CHECK-LABEL: ucvtf_i16_f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr h0, [x0]
-; CHECK-NEXT: ucvtf s0, s0
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: ucvtf s0, w8
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: ucvtf_i16_f32:
; NONEON-NOSVE: // %bb.0:
-; NONEON-NOSVE-NEXT: ldr h0, [x0]
-; NONEON-NOSVE-NEXT: ucvtf s0, s0
+; NONEON-NOSVE-NEXT: ldrh w8, [x0]
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
; NONEON-NOSVE-NEXT: ret
%2 = load i16, ptr %0, align 64
%3 = uitofp i16 %2 to float
@@ -3013,14 +2993,14 @@ define float @ucvtf_i16_f32(ptr %0) {
define double @ucvtf_i16_f64(ptr %0) {
; CHECK-LABEL: ucvtf_i16_f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr h0, [x0]
-; CHECK-NEXT: ucvtf d0, d0
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: ucvtf d0, w8
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: ucvtf_i16_f64:
; NONEON-NOSVE: // %bb.0:
-; NONEON-NOSVE-NEXT: ldr h0, [x0]
-; NONEON-NOSVE-NEXT: ucvtf d0, d0
+; NONEON-NOSVE-NEXT: ldrh w8, [x0]
+; NONEON-NOSVE-NEXT: ucvtf d0, w8
; NONEON-NOSVE-NEXT: ret
%2 = load i16, ptr %0, align 64
%3 = uitofp i16 %2 to double
@@ -3065,14 +3045,14 @@ define float @ucvtf_i32_f32(ptr %0) {
define double @ucvtf_i32_f64(ptr %0) {
; CHECK-LABEL: ucvtf_i32_f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr s0, [x0]
-; CHECK-NEXT: ucvtf d0, d0
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: ucvtf d0, w8
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: ucvtf_i32_f64:
; NONEON-NOSVE: // %bb.0:
-; NONEON-NOSVE-NEXT: ldr s0, [x0]
-; NONEON-NOSVE-NEXT: ucvtf d0, d0
+; NONEON-NOSVE-NEXT: ldr w8, [x0]
+; NONEON-NOSVE-NEXT: ucvtf d0, w8
; NONEON-NOSVE-NEXT: ret
%2 = load i32, ptr %0, align 64
%3 = uitofp i32 %2 to double
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-asserts.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-asserts.ll
index 6dce6c1..6e4fb26 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-asserts.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-asserts.ll
@@ -27,10 +27,8 @@ define hidden <2 x i64> @icmp_v2i32_zext_to_v2i64(<2 x i32> %arg) {
; CHECK-NEXT: v_mov_b32_e32 v3, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; CHECK-NEXT: v_and_b32_e32 v2, 1, v1
; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; CHECK-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq <2 x i32> %arg, zeroinitializer
%sext = zext <2 x i1> %cmp to <2 x i64>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir
index 11411c6..1971cd8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir
@@ -2,7 +2,7 @@
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX6 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX6 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX11 %s
# Note: 16-bit instructions generally produce a 0 result in the high 16 bits on GFX8 and GFX9, and preserve the high 16 bits on GFX10+
@@ -23,6 +23,7 @@ body: |
; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX6-NEXT: [[V_ADD_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 [[COPY]], [[COPY1]], 0, implicit $exec
; GFX6-NEXT: S_ENDPGM 0, implicit [[V_ADD_U16_e64_]]
+ ;
; GFX10-LABEL: name: add_s16
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
@@ -30,6 +31,14 @@ body: |
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX10-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_ADD_NC_U16_e64_]]
+ ;
+ ; GFX11-LABEL: name: add_s16
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_ADD_NC_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_ADD_NC_U16_fake16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
@@ -56,6 +65,7 @@ body: |
; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX6-NEXT: [[V_ADD_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 [[COPY]], [[COPY1]], 0, implicit $exec
; GFX6-NEXT: S_ENDPGM 0, implicit [[V_ADD_U16_e64_]]
+ ;
; GFX10-LABEL: name: add_s16_zext_to_s32
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
@@ -65,6 +75,16 @@ body: |
; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_ADD_NC_U16_e64_]], implicit $exec
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
+ ;
+ ; GFX11-LABEL: name: add_s16_zext_to_s32
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_ADD_NC_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+ ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
+ ; GFX11-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_ADD_NC_U16_fake16_e64_]], implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
@@ -91,12 +111,20 @@ body: |
; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX6-NEXT: [[V_SUB_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 [[COPY]], 64, 0, implicit $exec
; GFX6-NEXT: S_ENDPGM 0, implicit [[V_SUB_U16_e64_]]
+ ;
; GFX10-LABEL: name: add_s16_neg_inline_const_64
; GFX10: liveins: $vgpr0
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX10-NEXT: [[V_SUB_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_NC_U16_e64 0, [[COPY]], 0, 64, 0, 0, implicit $exec
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_SUB_NC_U16_e64_]]
+ ;
+ ; GFX11-LABEL: name: add_s16_neg_inline_const_64
+ ; GFX11: liveins: $vgpr0
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[V_SUB_NC_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_NC_U16_fake16_e64 0, [[COPY]], 0, 64, 0, 0, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_SUB_NC_U16_fake16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s16) = G_TRUNC %0
%2:vgpr(s16) = G_CONSTANT i16 -64
@@ -121,6 +149,7 @@ body: |
; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX6-NEXT: [[V_SUB_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 [[COPY]], 64, 0, implicit $exec
; GFX6-NEXT: S_ENDPGM 0, implicit [[V_SUB_U16_e64_]]
+ ;
; GFX10-LABEL: name: add_s16_neg_inline_const_64_zext_to_s32
; GFX10: liveins: $vgpr0
; GFX10-NEXT: {{ $}}
@@ -129,6 +158,15 @@ body: |
; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_SUB_NC_U16_e64_]], implicit $exec
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
+ ;
+ ; GFX11-LABEL: name: add_s16_neg_inline_const_64_zext_to_s32
+ ; GFX11: liveins: $vgpr0
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[V_SUB_NC_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_NC_U16_fake16_e64 0, [[COPY]], 0, 64, 0, 0, implicit $exec
+ ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
+ ; GFX11-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_SUB_NC_U16_fake16_e64_]], implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s16) = G_TRUNC %0
%2:vgpr(s16) = G_CONSTANT i16 -64
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir
index 17cdab4..b5f91b6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
---
name: fcmp_false_f16
@@ -10,15 +11,27 @@ tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1
- ; CHECK-LABEL: name: fcmp_false_f16
- ; CHECK: liveins: $vgpr0, $vgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; CHECK-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_CVT_F16_F32_fake16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F16_t16_e64 0, [[V_CVT_F16_F32_fake16_e64_]], 0, [[V_CVT_F16_F32_fake16_e64_1]], 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]]
+ ; GFX11-TRUE16-LABEL: name: fcmp_false_f16
+ ; GFX11-TRUE16: liveins: $vgpr0, $vgpr1
+ ; GFX11-TRUE16-NEXT: {{ $}}
+ ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]]
+ ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]]
+ ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]]
+ ;
+ ; GFX11-FAKE16-LABEL: name: fcmp_false_f16
+ ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1
+ ; GFX11-FAKE16-NEXT: {{ $}}
+ ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-FAKE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F16_t16_e64 0, [[V_CVT_F16_F32_fake16_e64_]], 0, [[V_CVT_F16_F32_fake16_e64_1]], 0, implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_FPTRUNC %0
@@ -36,15 +49,27 @@ tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1
- ; CHECK-LABEL: name: fcmp_true_f16
- ; CHECK: liveins: $vgpr0, $vgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; CHECK-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_CVT_F16_F32_fake16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F16_t16_e64 0, [[V_CVT_F16_F32_fake16_e64_]], 0, [[V_CVT_F16_F32_fake16_e64_1]], 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]]
+ ; GFX11-TRUE16-LABEL: name: fcmp_true_f16
+ ; GFX11-TRUE16: liveins: $vgpr0, $vgpr1
+ ; GFX11-TRUE16-NEXT: {{ $}}
+ ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]]
+ ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]]
+ ; GFX11-TRUE16-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]]
+ ;
+ ; GFX11-FAKE16-LABEL: name: fcmp_true_f16
+ ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1
+ ; GFX11-FAKE16-NEXT: {{ $}}
+ ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-FAKE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F16_t16_e64 0, [[V_CVT_F16_F32_fake16_e64_]], 0, [[V_CVT_F16_F32_fake16_e64_1]], 0, implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_FPTRUNC %0
@@ -62,13 +87,13 @@ tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1
- ; CHECK-LABEL: name: fcmp_false_f32
- ; CHECK: liveins: $vgpr0, $vgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; CHECK-NEXT: [[V_CMP_F_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F32_e64_]]
+ ; GFX11-LABEL: name: fcmp_false_f32
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_CMP_F_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F32_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%4:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.fcmp), %0, %1, 0
@@ -84,13 +109,13 @@ tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1
- ; CHECK-LABEL: name: fcmp_true_f32
- ; CHECK: liveins: $vgpr0, $vgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; CHECK-NEXT: [[V_CMP_TRU_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F32_e64_]]
+ ; GFX11-LABEL: name: fcmp_true_f32
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_CMP_TRU_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F32_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%4:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.fcmp), %0, %1, 15
@@ -106,15 +131,15 @@ tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1
- ; CHECK-LABEL: name: fcmp_false_f64
- ; CHECK: liveins: $vgpr0, $vgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; CHECK-NEXT: [[V_CVT_F64_F32_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_CVT_F64_F32_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_CMP_F_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F64_e64 0, [[V_CVT_F64_F32_e64_]], 0, [[V_CVT_F64_F32_e64_1]], 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F64_e64_]]
+ ; GFX11-LABEL: name: fcmp_false_f64
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_CVT_F64_F32_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_CVT_F64_F32_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_CMP_F_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F64_e64 0, [[V_CVT_F64_F32_e64_]], 0, [[V_CVT_F64_F32_e64_1]], 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F64_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s64) = G_FPEXT %0
@@ -132,15 +157,15 @@ tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1
- ; CHECK-LABEL: name: fcmp_true_f64
- ; CHECK: liveins: $vgpr0, $vgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; CHECK-NEXT: [[V_CVT_F64_F32_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_CVT_F64_F32_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_CMP_TRU_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F64_e64 0, [[V_CVT_F64_F32_e64_]], 0, [[V_CVT_F64_F32_e64_1]], 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F64_e64_]]
+ ; GFX11-LABEL: name: fcmp_true_f64
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_CVT_F64_F32_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_CVT_F64_F32_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_CMP_TRU_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F64_e64 0, [[V_CVT_F64_F32_e64_]], 0, [[V_CVT_F64_F32_e64_1]], 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F64_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s64) = G_FPEXT %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir
index 158076a3..a67a0b6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize64" -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize64",+real-true16 -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize64",-real-true16 -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
---
name: fcmp_false_f16
@@ -10,15 +11,27 @@ tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1
- ; CHECK-LABEL: name: fcmp_false_f16
- ; CHECK: liveins: $vgpr0, $vgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; CHECK-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_CVT_F16_F32_fake16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F16_t16_e64 0, [[V_CVT_F16_F32_fake16_e64_]], 0, [[V_CVT_F16_F32_fake16_e64_1]], 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]]
+ ; GFX11-TRUE16-LABEL: name: fcmp_false_f16
+ ; GFX11-TRUE16: liveins: $vgpr0, $vgpr1
+ ; GFX11-TRUE16-NEXT: {{ $}}
+ ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]]
+ ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]]
+ ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]]
+ ;
+ ; GFX11-FAKE16-LABEL: name: fcmp_false_f16
+ ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1
+ ; GFX11-FAKE16-NEXT: {{ $}}
+ ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-FAKE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F16_t16_e64 0, [[V_CVT_F16_F32_fake16_e64_]], 0, [[V_CVT_F16_F32_fake16_e64_1]], 0, implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_FPTRUNC %0
@@ -36,15 +49,27 @@ tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1
- ; CHECK-LABEL: name: fcmp_true_f16
- ; CHECK: liveins: $vgpr0, $vgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; CHECK-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_CVT_F16_F32_fake16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F16_t16_e64 0, [[V_CVT_F16_F32_fake16_e64_]], 0, [[V_CVT_F16_F32_fake16_e64_1]], 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]]
+ ; GFX11-TRUE16-LABEL: name: fcmp_true_f16
+ ; GFX11-TRUE16: liveins: $vgpr0, $vgpr1
+ ; GFX11-TRUE16-NEXT: {{ $}}
+ ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]]
+ ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]]
+ ; GFX11-TRUE16-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]]
+ ;
+ ; GFX11-FAKE16-LABEL: name: fcmp_true_f16
+ ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1
+ ; GFX11-FAKE16-NEXT: {{ $}}
+ ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-FAKE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F16_t16_e64 0, [[V_CVT_F16_F32_fake16_e64_]], 0, [[V_CVT_F16_F32_fake16_e64_1]], 0, implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_FPTRUNC %0
@@ -62,13 +87,13 @@ tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1
- ; CHECK-LABEL: name: fcmp_false_f32
- ; CHECK: liveins: $vgpr0, $vgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; CHECK-NEXT: [[V_CMP_F_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F32_e64_]]
+ ; GFX11-LABEL: name: fcmp_false_f32
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_CMP_F_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F32_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%4:sgpr(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.fcmp), %0, %1, 0
@@ -84,13 +109,13 @@ tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1
- ; CHECK-LABEL: name: fcmp_true_f32
- ; CHECK: liveins: $vgpr0, $vgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; CHECK-NEXT: [[V_CMP_TRU_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F32_e64_]]
+ ; GFX11-LABEL: name: fcmp_true_f32
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_CMP_TRU_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F32_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%4:sgpr(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.fcmp), %0, %1, 15
@@ -106,15 +131,15 @@ tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1
- ; CHECK-LABEL: name: fcmp_false_f64
- ; CHECK: liveins: $vgpr0, $vgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; CHECK-NEXT: [[V_CVT_F64_F32_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_CVT_F64_F32_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_CMP_F_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F64_e64 0, [[V_CVT_F64_F32_e64_]], 0, [[V_CVT_F64_F32_e64_1]], 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F64_e64_]]
+ ; GFX11-LABEL: name: fcmp_false_f64
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_CVT_F64_F32_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_CVT_F64_F32_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_CMP_F_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F64_e64 0, [[V_CVT_F64_F32_e64_]], 0, [[V_CVT_F64_F32_e64_1]], 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F64_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s64) = G_FPEXT %0
@@ -132,15 +157,15 @@ tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1
- ; CHECK-LABEL: name: fcmp_true_f64
- ; CHECK: liveins: $vgpr0, $vgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; CHECK-NEXT: [[V_CVT_F64_F32_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_CVT_F64_F32_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_CMP_TRU_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F64_e64 0, [[V_CVT_F64_F32_e64_]], 0, [[V_CVT_F64_F32_e64_1]], 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F64_e64_]]
+ ; GFX11-LABEL: name: fcmp_true_f64
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_CVT_F64_F32_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_CVT_F64_F32_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_CMP_TRU_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F64_e64 0, [[V_CVT_F64_F32_e64_]], 0, [[V_CVT_F64_F32_e64_1]], 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F64_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s64) = G_FPEXT %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir
index 0ff633f..df2f390 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir
@@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX11 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX11-TRUE16 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX11-FAKE16 %s
---
@@ -45,15 +45,15 @@ body: |
; GFX8-NEXT: [[V_CEIL_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY [[V_CEIL_F16_e64_]]
;
- ; GFX11-LABEL: name: fceil_s16_vv
- ; GFX11: liveins: $vgpr0
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
- ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CEIL_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16
- ; GFX11-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]]
+ ; GFX11-TRUE16-LABEL: name: fceil_s16_vv
+ ; GFX11-TRUE16: liveins: $vgpr0
+ ; GFX11-TRUE16-NEXT: {{ $}}
+ ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-TRUE16-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CEIL_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16
+ ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]]
;
; GFX11-FAKE16-LABEL: name: fceil_s16_vv
; GFX11-FAKE16: liveins: $vgpr0
@@ -85,14 +85,14 @@ body: |
; GFX8-NEXT: [[V_CEIL_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY [[V_CEIL_F16_e64_]]
;
- ; GFX11-LABEL: name: fceil_s16_vs
- ; GFX11: liveins: $sgpr0
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CEIL_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16
- ; GFX11-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]]
+ ; GFX11-TRUE16-LABEL: name: fceil_s16_vs
+ ; GFX11-TRUE16: liveins: $sgpr0
+ ; GFX11-TRUE16-NEXT: {{ $}}
+ ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX11-TRUE16-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CEIL_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16
+ ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]]
;
; GFX11-FAKE16-LABEL: name: fceil_s16_vs
; GFX11-FAKE16: liveins: $sgpr0
@@ -124,15 +124,15 @@ body: |
; GFX8-NEXT: [[V_CEIL_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F16_e64 1, [[COPY]], 0, 0, implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY [[V_CEIL_F16_e64_]]
;
- ; GFX11-LABEL: name: fceil_fneg_s16_vv
- ; GFX11: liveins: $vgpr0
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
- ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 1, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CEIL_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16
- ; GFX11-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]]
+ ; GFX11-TRUE16-LABEL: name: fceil_fneg_s16_vv
+ ; GFX11-TRUE16: liveins: $vgpr0
+ ; GFX11-TRUE16-NEXT: {{ $}}
+ ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-TRUE16-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 1, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CEIL_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16
+ ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]]
;
; GFX11-FAKE16-LABEL: name: fceil_fneg_s16_vv
; GFX11-FAKE16: liveins: $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir
index fc8a6aa..df62806 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir
@@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=VI %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX11 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX11-TRUE16 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX11-FAKE16 %s
---
@@ -54,15 +54,15 @@ body: |
; VI-NEXT: [[V_FLOOR_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FLOOR_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
; VI-NEXT: $vgpr0 = COPY [[V_FLOOR_F16_e64_]]
;
- ; GFX11-LABEL: name: ffloor_s16_vv
- ; GFX11: liveins: $vgpr0
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
- ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_FLOOR_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16
- ; GFX11-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]]
+ ; GFX11-TRUE16-LABEL: name: ffloor_s16_vv
+ ; GFX11-TRUE16: liveins: $vgpr0
+ ; GFX11-TRUE16-NEXT: {{ $}}
+ ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-TRUE16-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_FLOOR_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16
+ ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]]
;
; GFX11-FAKE16-LABEL: name: ffloor_s16_vv
; GFX11-FAKE16: liveins: $vgpr0
@@ -94,14 +94,14 @@ body: |
; VI-NEXT: [[V_FLOOR_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FLOOR_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
; VI-NEXT: $vgpr0 = COPY [[V_FLOOR_F16_e64_]]
;
- ; GFX11-LABEL: name: ffloor_s16_vs
- ; GFX11: liveins: $sgpr0
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_FLOOR_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16
- ; GFX11-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]]
+ ; GFX11-TRUE16-LABEL: name: ffloor_s16_vs
+ ; GFX11-TRUE16: liveins: $sgpr0
+ ; GFX11-TRUE16-NEXT: {{ $}}
+ ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX11-TRUE16-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_FLOOR_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16
+ ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]]
;
; GFX11-FAKE16-LABEL: name: ffloor_s16_vs
; GFX11-FAKE16: liveins: $sgpr0
@@ -133,15 +133,15 @@ body: |
; VI-NEXT: [[V_FLOOR_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FLOOR_F16_e64 1, [[COPY]], 0, 0, implicit $mode, implicit $exec
; VI-NEXT: $vgpr0 = COPY [[V_FLOOR_F16_e64_]]
;
- ; GFX11-LABEL: name: ffloor_fneg_s16_vv
- ; GFX11: liveins: $vgpr0
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
- ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 1, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_FLOOR_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16
- ; GFX11-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]]
+ ; GFX11-TRUE16-LABEL: name: ffloor_fneg_s16_vv
+ ; GFX11-TRUE16: liveins: $vgpr0
+ ; GFX11-TRUE16-NEXT: {{ $}}
+ ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-TRUE16-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 1, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_FLOOR_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16
+ ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]]
;
; GFX11-FAKE16-LABEL: name: ffloor_fneg_s16_vv
; GFX11-FAKE16: liveins: $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptosi.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptosi.mir
index 32a73bc..03cb907 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptosi.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptosi.mir
@@ -1,7 +1,8 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GCN
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=VI
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GFX11
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GFX11,GFX11-TRUE16
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
---
name: fptosi_s32_to_s32_vv
@@ -135,13 +136,22 @@ body: |
; VI-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec
; VI-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]]
;
- ; GFX11-LABEL: name: fptosi_s16_to_s32_vv
- ; GFX11: liveins: $vgpr0
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec
- ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]]
+ ; GFX11-TRUE16-LABEL: name: fptosi_s16_to_s32_vv
+ ; GFX11-TRUE16: liveins: $vgpr0
+ ; GFX11-TRUE16-NEXT: {{ $}}
+ ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]]
+ ;
+ ; GFX11-FAKE16-LABEL: name: fptosi_s16_to_s32_vv
+ ; GFX11-FAKE16: liveins: $vgpr0
+ ; GFX11-FAKE16-NEXT: {{ $}}
+ ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s16) = G_TRUNC %0
%2:vgpr(s32) = G_FPTOSI %1
@@ -174,13 +184,21 @@ body: |
; VI-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec
; VI-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]]
;
- ; GFX11-LABEL: name: fptosi_s16_to_s32_vs
- ; GFX11: liveins: $sgpr0
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec
- ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]]
+ ; GFX11-TRUE16-LABEL: name: fptosi_s16_to_s32_vs
+ ; GFX11-TRUE16: liveins: $sgpr0
+ ; GFX11-TRUE16-NEXT: {{ $}}
+ ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]]
+ ;
+ ; GFX11-FAKE16-LABEL: name: fptosi_s16_to_s32_vs
+ ; GFX11-FAKE16: liveins: $sgpr0
+ ; GFX11-FAKE16-NEXT: {{ $}}
+ ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]]
%0:sgpr(s32) = COPY $sgpr0
%1:sgpr(s16) = G_TRUNC %0
%2:vgpr(s32) = G_FPTOSI %1
@@ -217,15 +235,25 @@ body: |
; VI-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec
; VI-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]]
;
- ; GFX11-LABEL: name: fptosi_s16_to_s32_fneg_vv
- ; GFX11: liveins: $vgpr0
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768
- ; GFX11-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec
- ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec
- ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]]
+ ; GFX11-TRUE16-LABEL: name: fptosi_s16_to_s32_fneg_vv
+ ; GFX11-TRUE16: liveins: $vgpr0
+ ; GFX11-TRUE16-NEXT: {{ $}}
+ ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-TRUE16-NEXT: [[V_XOR_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_XOR_B16_t16_e64 0, 32768, 0, [[COPY1]], 0, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[V_XOR_B16_t16_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]]
+ ;
+ ; GFX11-FAKE16-LABEL: name: fptosi_s16_to_s32_fneg_vv
+ ; GFX11-FAKE16: liveins: $vgpr0
+ ; GFX11-FAKE16-NEXT: {{ $}}
+ ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-FAKE16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768
+ ; GFX11-FAKE16-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec
+ ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s16) = G_TRUNC %0
%2:vgpr(s16) = G_FNEG %1
@@ -259,13 +287,23 @@ body: |
; VI-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec
; VI-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]]
;
- ; GFX11-LABEL: name: fptosi_s16_to_s1_vv
- ; GFX11: liveins: $vgpr0
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]]
+ ; GFX11-TRUE16-LABEL: name: fptosi_s16_to_s1_vv
+ ; GFX11-TRUE16: liveins: $vgpr0
+ ; GFX11-TRUE16-NEXT: {{ $}}
+ ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[V_CVT_I32_F32_e32_]].lo16
+ ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[COPY2]]
+ ;
+ ; GFX11-FAKE16-LABEL: name: fptosi_s16_to_s1_vv
+ ; GFX11-FAKE16: liveins: $vgpr0
+ ; GFX11-FAKE16-NEXT: {{ $}}
+ ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s16) = G_TRUNC %0
%2:vgpr(s32) = G_FPTOSI %1
@@ -299,13 +337,22 @@ body: |
; VI-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec
; VI-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]]
;
- ; GFX11-LABEL: name: fptosi_s16_to_s1_vs
- ; GFX11: liveins: $sgpr0
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]]
+ ; GFX11-TRUE16-LABEL: name: fptosi_s16_to_s1_vs
+ ; GFX11-TRUE16: liveins: $sgpr0
+ ; GFX11-TRUE16-NEXT: {{ $}}
+ ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[V_CVT_I32_F32_e32_]].lo16
+ ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[COPY1]]
+ ;
+ ; GFX11-FAKE16-LABEL: name: fptosi_s16_to_s1_vs
+ ; GFX11-FAKE16: liveins: $sgpr0
+ ; GFX11-FAKE16-NEXT: {{ $}}
+ ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]]
%0:sgpr(s32) = COPY $sgpr0
%1:sgpr(s16) = G_TRUNC %0
%2:vgpr(s32) = G_FPTOSI %1
@@ -343,15 +390,26 @@ body: |
; VI-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec
; VI-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]]
;
- ; GFX11-LABEL: name: fptosi_s16_to_s1_fneg_vv
- ; GFX11: liveins: $vgpr0
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768
- ; GFX11-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec
- ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]]
+ ; GFX11-TRUE16-LABEL: name: fptosi_s16_to_s1_fneg_vv
+ ; GFX11-TRUE16: liveins: $vgpr0
+ ; GFX11-TRUE16-NEXT: {{ $}}
+ ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-TRUE16-NEXT: [[V_XOR_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_XOR_B16_t16_e64 0, 32768, 0, [[COPY1]], 0, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[V_XOR_B16_t16_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[V_CVT_I32_F32_e32_]].lo16
+ ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[COPY2]]
+ ;
+ ; GFX11-FAKE16-LABEL: name: fptosi_s16_to_s1_fneg_vv
+ ; GFX11-FAKE16: liveins: $vgpr0
+ ; GFX11-FAKE16-NEXT: {{ $}}
+ ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-FAKE16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768
+ ; GFX11-FAKE16-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec
+ ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s16) = G_TRUNC %0
%2:vgpr(s16) = G_FNEG %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptoui.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptoui.mir
index 47a0918..521a0e8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptoui.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptoui.mir
@@ -1,7 +1,8 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GCN
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=VI
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GFX11
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GFX11,GFX11-TRUE16
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
---
@@ -85,13 +86,22 @@ body: |
; VI-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec
; VI-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]]
;
- ; GFX11-LABEL: name: fptoui_s16_to_s32_vv
- ; GFX11: liveins: $vgpr0
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec
- ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]]
+ ; GFX11-TRUE16-LABEL: name: fptoui_s16_to_s32_vv
+ ; GFX11-TRUE16: liveins: $vgpr0
+ ; GFX11-TRUE16-NEXT: {{ $}}
+ ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]]
+ ;
+ ; GFX11-FAKE16-LABEL: name: fptoui_s16_to_s32_vv
+ ; GFX11-FAKE16: liveins: $vgpr0
+ ; GFX11-FAKE16-NEXT: {{ $}}
+ ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s16) = G_TRUNC %0
%2:vgpr(s32) = G_FPTOUI %1
@@ -124,13 +134,21 @@ body: |
; VI-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec
; VI-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]]
;
- ; GFX11-LABEL: name: fptoui_s16_to_s32_vs
- ; GFX11: liveins: $sgpr0
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec
- ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]]
+ ; GFX11-TRUE16-LABEL: name: fptoui_s16_to_s32_vs
+ ; GFX11-TRUE16: liveins: $sgpr0
+ ; GFX11-TRUE16-NEXT: {{ $}}
+ ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]]
+ ;
+ ; GFX11-FAKE16-LABEL: name: fptoui_s16_to_s32_vs
+ ; GFX11-FAKE16: liveins: $sgpr0
+ ; GFX11-FAKE16-NEXT: {{ $}}
+ ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]]
%0:sgpr(s32) = COPY $sgpr0
%1:sgpr(s16) = G_TRUNC %0
%2:vgpr(s32) = G_FPTOUI %1
@@ -167,15 +185,25 @@ body: |
; VI-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec
; VI-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]]
;
- ; GFX11-LABEL: name: fptoui_s16_to_s32_fneg_vv
- ; GFX11: liveins: $vgpr0
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768
- ; GFX11-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec
- ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec
- ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]]
+ ; GFX11-TRUE16-LABEL: name: fptoui_s16_to_s32_fneg_vv
+ ; GFX11-TRUE16: liveins: $vgpr0
+ ; GFX11-TRUE16-NEXT: {{ $}}
+ ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-TRUE16-NEXT: [[V_XOR_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_XOR_B16_t16_e64 0, 32768, 0, [[COPY1]], 0, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[V_XOR_B16_t16_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]]
+ ;
+ ; GFX11-FAKE16-LABEL: name: fptoui_s16_to_s32_fneg_vv
+ ; GFX11-FAKE16: liveins: $vgpr0
+ ; GFX11-FAKE16-NEXT: {{ $}}
+ ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-FAKE16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768
+ ; GFX11-FAKE16-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec
+ ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s16) = G_TRUNC %0
%2:vgpr(s16) = G_FNEG %1
@@ -209,13 +237,23 @@ body: |
; VI-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec
; VI-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]]
;
- ; GFX11-LABEL: name: fptoui_s16_to_s1_vv
- ; GFX11: liveins: $vgpr0
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]]
+ ; GFX11-TRUE16-LABEL: name: fptoui_s16_to_s1_vv
+ ; GFX11-TRUE16: liveins: $vgpr0
+ ; GFX11-TRUE16-NEXT: {{ $}}
+ ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[V_CVT_U32_F32_e32_]].lo16
+ ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[COPY2]]
+ ;
+ ; GFX11-FAKE16-LABEL: name: fptoui_s16_to_s1_vv
+ ; GFX11-FAKE16: liveins: $vgpr0
+ ; GFX11-FAKE16-NEXT: {{ $}}
+ ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s16) = G_TRUNC %0
%2:vgpr(s32) = G_FPTOUI %1
@@ -249,13 +287,22 @@ body: |
; VI-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec
; VI-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]]
;
- ; GFX11-LABEL: name: fptoui_s16_to_s1_vs
- ; GFX11: liveins: $sgpr0
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]]
+ ; GFX11-TRUE16-LABEL: name: fptoui_s16_to_s1_vs
+ ; GFX11-TRUE16: liveins: $sgpr0
+ ; GFX11-TRUE16-NEXT: {{ $}}
+ ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[V_CVT_U32_F32_e32_]].lo16
+ ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[COPY1]]
+ ;
+ ; GFX11-FAKE16-LABEL: name: fptoui_s16_to_s1_vs
+ ; GFX11-FAKE16: liveins: $sgpr0
+ ; GFX11-FAKE16-NEXT: {{ $}}
+ ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]]
%0:sgpr(s32) = COPY $sgpr0
%1:sgpr(s16) = G_TRUNC %0
%2:vgpr(s32) = G_FPTOUI %1
@@ -293,15 +340,26 @@ body: |
; VI-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec
; VI-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]]
;
- ; GFX11-LABEL: name: fptoui_s16_to_s1_fneg_vv
- ; GFX11: liveins: $vgpr0
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768
- ; GFX11-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec
- ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]]
+ ; GFX11-TRUE16-LABEL: name: fptoui_s16_to_s1_fneg_vv
+ ; GFX11-TRUE16: liveins: $vgpr0
+ ; GFX11-TRUE16-NEXT: {{ $}}
+ ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-TRUE16-NEXT: [[V_XOR_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_XOR_B16_t16_e64 0, 32768, 0, [[COPY1]], 0, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[V_XOR_B16_t16_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[V_CVT_U32_F32_e32_]].lo16
+ ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[COPY2]]
+ ;
+ ; GFX11-FAKE16-LABEL: name: fptoui_s16_to_s1_fneg_vv
+ ; GFX11-FAKE16: liveins: $vgpr0
+ ; GFX11-FAKE16-NEXT: {{ $}}
+ ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-FAKE16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768
+ ; GFX11-FAKE16-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec
+ ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s16) = G_TRUNC %0
%2:vgpr(s16) = G_FNEG %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sitofp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sitofp.mir
index 938bb58..3888ce8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sitofp.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sitofp.mir
@@ -1,7 +1,8 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE64 %s
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE32 %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX11 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
---
@@ -85,13 +86,23 @@ body: |
; WAVE32-NEXT: [[V_CVT_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, [[V_CVT_F32_I32_e32_]], 0, 0, implicit $mode, implicit $exec
; WAVE32-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_e64_]]
;
- ; GFX11-LABEL: name: sitofp_s32_to_s16_vv
- ; GFX11: liveins: $vgpr0
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[V_CVT_F32_I32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e32 [[COPY]], implicit $mode, implicit $exec
- ; GFX11-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[V_CVT_F32_I32_e32_]], 0, 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_fake16_e64_]]
+ ; GFX11-TRUE16-LABEL: name: sitofp_s32_to_s16_vv
+ ; GFX11-TRUE16: liveins: $vgpr0
+ ; GFX11-TRUE16-NEXT: {{ $}}
+ ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-TRUE16-NEXT: [[V_CVT_F32_I32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e32 [[COPY]], implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[V_CVT_F32_I32_e32_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CVT_F16_F32_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16
+ ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]]
+ ;
+ ; GFX11-FAKE16-LABEL: name: sitofp_s32_to_s16_vv
+ ; GFX11-FAKE16: liveins: $vgpr0
+ ; GFX11-FAKE16-NEXT: {{ $}}
+ ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-FAKE16-NEXT: [[V_CVT_F32_I32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e32 [[COPY]], implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[V_CVT_F32_I32_e32_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_fake16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s16) = G_SITOFP %0
%2:vgpr(s32) = G_ANYEXT %1
@@ -124,13 +135,23 @@ body: |
; WAVE32-NEXT: [[V_CVT_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, [[V_CVT_F32_I32_e32_]], 0, 0, implicit $mode, implicit $exec
; WAVE32-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_e64_]]
;
- ; GFX11-LABEL: name: sitofp_s32_to_s16_vs
- ; GFX11: liveins: $sgpr0
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX11-NEXT: [[V_CVT_F32_I32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e32 [[COPY]], implicit $mode, implicit $exec
- ; GFX11-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[V_CVT_F32_I32_e32_]], 0, 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_fake16_e64_]]
+ ; GFX11-TRUE16-LABEL: name: sitofp_s32_to_s16_vs
+ ; GFX11-TRUE16: liveins: $sgpr0
+ ; GFX11-TRUE16-NEXT: {{ $}}
+ ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX11-TRUE16-NEXT: [[V_CVT_F32_I32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e32 [[COPY]], implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[V_CVT_F32_I32_e32_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CVT_F16_F32_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16
+ ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]]
+ ;
+ ; GFX11-FAKE16-LABEL: name: sitofp_s32_to_s16_vs
+ ; GFX11-FAKE16: liveins: $sgpr0
+ ; GFX11-FAKE16-NEXT: {{ $}}
+ ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX11-FAKE16-NEXT: [[V_CVT_F32_I32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e32 [[COPY]], implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[V_CVT_F32_I32_e32_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_fake16_e64_]]
%0:sgpr(s32) = COPY $sgpr0
%1:vgpr(s16) = G_SITOFP %0
%2:vgpr(s32) = G_ANYEXT %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-uitofp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-uitofp.mir
index 9c6fded..35d622d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-uitofp.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-uitofp.mir
@@ -1,7 +1,8 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE64 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE32 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX11 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
---
name: uitofp_s32_to_s32_vv
@@ -99,13 +100,23 @@ body: |
; WAVE32-NEXT: [[V_CVT_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, [[V_CVT_F32_U32_e32_]], 0, 0, implicit $mode, implicit $exec
; WAVE32-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_e64_]]
;
- ; GFX11-LABEL: name: uitofp_s32_to_s16_vv
- ; GFX11: liveins: $vgpr0
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY]], implicit $mode, implicit $exec
- ; GFX11-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[V_CVT_F32_U32_e32_]], 0, 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_fake16_e64_]]
+ ; GFX11-TRUE16-LABEL: name: uitofp_s32_to_s16_vv
+ ; GFX11-TRUE16: liveins: $vgpr0
+ ; GFX11-TRUE16-NEXT: {{ $}}
+ ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-TRUE16-NEXT: [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY]], implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[V_CVT_F32_U32_e32_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CVT_F16_F32_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16
+ ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]]
+ ;
+ ; GFX11-FAKE16-LABEL: name: uitofp_s32_to_s16_vv
+ ; GFX11-FAKE16: liveins: $vgpr0
+ ; GFX11-FAKE16-NEXT: {{ $}}
+ ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-FAKE16-NEXT: [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY]], implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[V_CVT_F32_U32_e32_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_fake16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s16) = G_UITOFP %0
%2:vgpr(s32) = G_ANYEXT %1
@@ -138,13 +149,23 @@ body: |
; WAVE32-NEXT: [[V_CVT_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, [[V_CVT_F32_U32_e32_]], 0, 0, implicit $mode, implicit $exec
; WAVE32-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_e64_]]
;
- ; GFX11-LABEL: name: uitofp_s32_to_s16_vs
- ; GFX11: liveins: $sgpr0
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX11-NEXT: [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY]], implicit $mode, implicit $exec
- ; GFX11-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[V_CVT_F32_U32_e32_]], 0, 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_fake16_e64_]]
+ ; GFX11-TRUE16-LABEL: name: uitofp_s32_to_s16_vs
+ ; GFX11-TRUE16: liveins: $sgpr0
+ ; GFX11-TRUE16-NEXT: {{ $}}
+ ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX11-TRUE16-NEXT: [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY]], implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[V_CVT_F32_U32_e32_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CVT_F16_F32_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16
+ ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]]
+ ;
+ ; GFX11-FAKE16-LABEL: name: uitofp_s32_to_s16_vs
+ ; GFX11-FAKE16: liveins: $sgpr0
+ ; GFX11-FAKE16-NEXT: {{ $}}
+ ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX11-FAKE16-NEXT: [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY]], implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[V_CVT_F32_U32_e32_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_fake16_e64_]]
%0:sgpr(s32) = COPY $sgpr0
%1:vgpr(s16) = G_UITOFP %0
%2:vgpr(s32) = G_ANYEXT %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll
index 9d586e3..eeb7b13 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll
@@ -1,7 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GFX78,GFX7 %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GFX78,GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
define i16 @v_powi_f16(i16 %l, i32 %r) {
; GFX7-LABEL: v_powi_f16:
@@ -36,21 +37,37 @@ define i16 @v_powi_f16(i16 %l, i32 %r) {
; GFX8-NEXT: v_exp_f16_e32 v0, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_powi_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_log_f16_e32 v0, v0
-; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: v_exp_f16_e32 v0, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_powi_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_log_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1
+; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v0.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.h
+; GFX11-TRUE16-NEXT: v_mul_dx9_zero_f32_e32 v0, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT: v_exp_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_powi_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_log_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT: v_exp_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%l.cast = bitcast i16 %l to half
%res = call half @llvm.powi.f16.i32(half %l.cast, i32 %r)
%res.cast = bitcast half %res to i16
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-redundant-and.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-redundant-and.mir
new file mode 100644
index 0000000..f87a253
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-redundant-and.mir
@@ -0,0 +1,28 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -run-pass=amdgpu-regbank-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name: replaceRegWith_requires_copy
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $vgpr0_vgpr1
+
+ ; CHECK-LABEL: name: replaceRegWith_requires_copy
+ ; CHECK: liveins: $sgpr0, $vgpr0_vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:sreg_32(s32) = G_ICMP intpred(ne), [[COPY1]](s32), [[C]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY [[ICMP]](s32)
+ ; CHECK-NEXT: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store (s32), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sgpr(p1) = COPY $vgpr0_vgpr1
+ %1:sgpr(s32) = COPY $sgpr0
+ %2:sgpr(s32) = G_CONSTANT i32 1
+ %3:sreg_32(s32) = G_ICMP intpred(ne), %1, %2
+ %4:sgpr(s32) = G_AND %3, %2
+ G_STORE %4(s32), %0(p1) :: (store (s32), addrspace 1)
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
index 1151bde..41b61f2 100644
--- a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
+++ b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
@@ -82,9 +82,9 @@ body: |
# Regression test for src_modifiers on base u16 opcode
# GCN-LABEL: name: vop3_u16
-# GCN: %5:vgpr_32 = V_ADD_NC_U16_e64_dpp %3, 0, %1, 0, %3, 0, 0, 1, 15, 15, 1, implicit $exec
-# GCN: %7:vgpr_32 = V_ADD_NC_U16_e64_dpp %3, 1, %5, 2, %5, 0, 0, 1, 15, 15, 1, implicit $exec
-# GCN: %9:vgpr_32 = V_ADD_NC_U16_e64 4, %8, 8, %7, 0, 0, implicit $exec
+# GCN: %5:vgpr_32 = V_ADD_NC_U16_fake16_e64_dpp %3, 0, %1, 0, %3, 0, 0, 1, 15, 15, 1, implicit $exec
+# GCN: %7:vgpr_32 = V_ADD_NC_U16_fake16_e64_dpp %3, 1, %5, 2, %5, 0, 0, 1, 15, 15, 1, implicit $exec
+# GCN: %9:vgpr_32 = V_ADD_NC_U16_fake16_e64 4, %8, 8, %7, 0, 0, implicit $exec
name: vop3_u16
tracksRegLiveness: true
body: |
@@ -96,11 +96,11 @@ body: |
%2:vgpr_32 = COPY $vgpr2
%3:vgpr_32 = IMPLICIT_DEF
%4:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
- %5:vgpr_32 = V_ADD_NC_U16_e64 0, %4, 0, %3, 0, 0, implicit $exec
+ %5:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, %4, 0, %3, 0, 0, implicit $exec
%6:vgpr_32 = V_MOV_B32_dpp %3, %5, 1, 15, 15, 1, implicit $exec
- %7:vgpr_32 = V_ADD_NC_U16_e64 1, %6, 2, %5, 0, 0, implicit $exec
+ %7:vgpr_32 = V_ADD_NC_U16_fake16_e64 1, %6, 2, %5, 0, 0, implicit $exec
%8:vgpr_32 = V_MOV_B32_dpp %3, %7, 1, 15, 15, 1, implicit $exec
- %9:vgpr_32 = V_ADD_NC_U16_e64 4, %8, 8, %7, 0, 0, implicit $exec
+ %9:vgpr_32 = V_ADD_NC_U16_fake16_e64 4, %8, 8, %7, 0, 0, implicit $exec
...
name: vop3p
@@ -880,11 +880,11 @@ body: |
# Check op_sel is all 0s when combining
# GCN-LABEL: name: opsel_vop3
-# GCN: %4:vgpr_32 = V_ADD_I16_e64_dpp %2, 0, %0, 0, %1, 0, 0, 1, 15, 15, 1, implicit $exec
-# GCN: %6:vgpr_32 = V_ADD_I16_e64 4, %5, 0, %1, 0, 0, implicit $exec
-# GCN: %8:vgpr_32 = V_ADD_I16_e64 0, %7, 4, %1, 0, 0, implicit $exec
-# GCN: %10:vgpr_32 = V_ADD_I16_e64 4, %9, 4, %1, 0, 0, implicit $exec
-# GCN: %12:vgpr_32 = V_ADD_I16_e64 8, %11, 0, %1, 0, 0, implicit $exec
+# GCN: %4:vgpr_32 = V_ADD_I16_fake16_e64_dpp %2, 0, %0, 0, %1, 0, 0, 1, 15, 15, 1, implicit $exec
+# GCN: %6:vgpr_32 = V_ADD_I16_fake16_e64 4, %5, 0, %1, 0, 0, implicit $exec
+# GCN: %8:vgpr_32 = V_ADD_I16_fake16_e64 0, %7, 4, %1, 0, 0, implicit $exec
+# GCN: %10:vgpr_32 = V_ADD_I16_fake16_e64 4, %9, 4, %1, 0, 0, implicit $exec
+# GCN: %12:vgpr_32 = V_ADD_I16_fake16_e64 8, %11, 0, %1, 0, 0, implicit $exec
name: opsel_vop3
tracksRegLiveness: true
body: |
@@ -897,23 +897,23 @@ body: |
; Combine for op_sel:[0,0,0]
%3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
- %4:vgpr_32 = V_ADD_I16_e64 0, %3, 0, %1, 0, 0, implicit $exec
+ %4:vgpr_32 = V_ADD_I16_fake16_e64 0, %3, 0, %1, 0, 0, implicit $exec
; Do not combine for op_sel:[1,0,0]
%5:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
- %6:vgpr_32 = V_ADD_I16_e64 4, %5, 0, %1, 0, 0, implicit $exec
+ %6:vgpr_32 = V_ADD_I16_fake16_e64 4, %5, 0, %1, 0, 0, implicit $exec
; Do not combine for op_sel:[0,1,0]
%7:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
- %8:vgpr_32 = V_ADD_I16_e64 0, %7, 4, %1, 0, 0, implicit $exec
+ %8:vgpr_32 = V_ADD_I16_fake16_e64 0, %7, 4, %1, 0, 0, implicit $exec
; Do not combine for op_sel:[1,1,0]
%9:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
- %10:vgpr_32 = V_ADD_I16_e64 4, %9, 4, %1, 0, 0, implicit $exec
+ %10:vgpr_32 = V_ADD_I16_fake16_e64 4, %9, 4, %1, 0, 0, implicit $exec
; Do not combine for op_sel:[0,0,1] (dst_op_sel only)
%11:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
- %12:vgpr_32 = V_ADD_I16_e64 8, %11, 0, %1, 0, 0, implicit $exec
+ %12:vgpr_32 = V_ADD_I16_fake16_e64 8, %11, 0, %1, 0, 0, implicit $exec
...
# Check op_sel is all 0s and op_sel_hi is all 1s when combining
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-fake16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-fake16.mir
index 265bdd0..30a24c67 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-fake16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-fake16.mir
@@ -1,6 +1,29 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN %s
+# V_CMP_LT_F16 will be replaced with fake16 when its true16/fake16 profile is corrected
+
+---
+name: cmp_f16
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: cmp_f16
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_CVT_F16_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F16_U16_fake16_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_CMP_LT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_t16_e64 0, [[V_CVT_F16_U16_fake16_e64_]], 0, [[DEF1]], 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed [[V_CMP_LT_F16_t16_e64_]], implicit $exec
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sreg_32 = IMPLICIT_DEF
+ %2:vgpr_32 = V_CVT_F16_U16_fake16_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %3:sreg_32 = COPY %2:vgpr_32
+ nofpexcept S_CMP_LT_F16 killed %3:sreg_32, %1:sreg_32, implicit-def $scc, implicit $mode
+ %4:sreg_32_xm0_xexec = COPY $scc
+ %5:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed %4, implicit $exec
+...
+
+# Needs extra shift instruction to select hi 16 bits
---
name: cvt_hi_f32_f16
body: |
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
index 03a77dc..4604518 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
@@ -1,20 +1,39 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN %s
-# XFAIL: *
-# FIXME-TRUE16. reenable after CVT_F16_U16_t16 is supported in CodeGen
+#
+
+---
+name: cmp_f16
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: cmp_f16
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_U16_t16_e64_]]
+ ; GCN-NEXT: [[V_CMP_LT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_t16_e64 0, killed [[COPY]], 0, [[DEF1]], 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed [[V_CMP_LT_F16_t16_e64_]], implicit $exec
+ %0:vgpr_16 = IMPLICIT_DEF
+ %1:sreg_32 = IMPLICIT_DEF
+ %2:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec
+ %3:sreg_32 = COPY %2:vgpr_16
+ nofpexcept S_CMP_LT_F16 killed %3:sreg_32, %1:sreg_32, implicit-def $scc, implicit $mode
+ %4:sreg_32_xm0_xexec = COPY $scc
+ %5:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed %4, implicit $exec
+...
---
name: cvt_hi_f32_f16
body: |
bb.0:
; GCN-LABEL: name: cvt_hi_f32_f16
- ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
- ; GCN-NEXT: [[V_CVT_F16_U16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F16_U16_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; GCN-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[V_CVT_F16_U16_e64_]], implicit $exec
- ; GCN-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_F16_t16_e64 0, [[V_LSHRREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_U16_t16_e64_]]
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GCN-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_F16_t16_e64 0, [[COPY1]].hi16, 0, 0, 0, implicit $mode, implicit $exec
%0:vgpr_16 = IMPLICIT_DEF
- %1:vgpr_16 = V_CVT_F16_U16_t16_e64 %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec
+ %1:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec
%2:sreg_32 = COPY %1:vgpr_16
%3:sreg_32 = S_CVT_HI_F32_F16 %2:sreg_32, implicit $mode
...
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
index 9a727a3..e8291f7 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
@@ -3,26 +3,6 @@
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,FAKE16 %s
---
-name: cmp_f16
-body: |
- bb.0.entry:
- ; GCN-LABEL: name: cmp_f16
- ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
- ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; GCN-NEXT: [[V_CVT_F16_U16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F16_U16_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; GCN-NEXT: [[V_CMP_LT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_t16_e64 0, [[V_CVT_F16_U16_e64_]], 0, [[DEF1]], 0, implicit $mode, implicit $exec
- ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed [[V_CMP_LT_F16_t16_e64_]], implicit $exec
- %0:vgpr_32 = IMPLICIT_DEF
- %1:sreg_32 = IMPLICIT_DEF
- %2:vgpr_32 = V_CVT_F16_U16_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec
- %3:sreg_32 = COPY %2:vgpr_32
- nofpexcept S_CMP_LT_F16 killed %3:sreg_32, %1:sreg_32, implicit-def $scc, implicit $mode
- %4:sreg_32_xm0_xexec = COPY $scc
- %5:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed %4, implicit $exec
-...
-
----
name: fmac_f16
body: |
bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll
index 81859dc..064e888 100644
--- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll
@@ -1,7 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
; RUN: llc -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=CYPRESS %s
; RUN: llc -mtriple=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefixes=CAYMAN %s
@@ -44,25 +45,45 @@ define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %o
; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
-; GFX11-LABEL: test_convert_fp16_to_fp32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: test_convert_fp16_to_fp32:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: test_convert_fp16_to_fp32:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
;
; CYPRESS-LABEL: test_convert_fp16_to_fp32:
; CYPRESS: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll
index c17be87..6c9f451 100644
--- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll
@@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
+
declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone
@@ -44,27 +46,49 @@ define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %o
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
-; GFX11-LABEL: test_convert_fp16_to_fp64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: test_convert_fp16_to_fp64:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: test_convert_fp16_to_fp64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
%val = load i16, ptr addrspace(1) %in, align 2
%cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone
store double %cvt, ptr addrspace(1) %out, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll
index d8a726f..5bac710 100644
--- a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll
@@ -1,7 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=CYPRESS %s
declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone
@@ -43,25 +44,45 @@ define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %o
; GFX8-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
-; GFX11-LABEL: test_convert_fp32_to_fp16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: test_convert_fp32_to_fp16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: test_convert_fp32_to_fp16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
;
; CYPRESS-LABEL: test_convert_fp32_to_fp16:
; CYPRESS: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
index 75f4dff..a40d678 100644
--- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
@@ -2,7 +2,8 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,GFX9 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-FAKE16 %s
define amdgpu_kernel void @fpext_f16_to_f32(
; SI-LABEL: fpext_f16_to_f32:
@@ -59,25 +60,45 @@ define amdgpu_kernel void @fpext_f16_to_f32(
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: fpext_f16_to_f32:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fpext_f16_to_f32:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fpext_f16_to_f32:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) #0 {
entry:
@@ -145,27 +166,49 @@ define amdgpu_kernel void @fpext_f16_to_f64(
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: fpext_f16_to_f64:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fpext_f16_to_f64:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fpext_f16_to_f64:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) #0 {
entry:
@@ -234,28 +277,51 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f32(
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: fpext_v2f16_to_v2f32:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fpext_v2f16_to_v2f32:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l
+; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fpext_v2f16_to_v2f32:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) #0 {
entry:
@@ -330,31 +396,57 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f64(
; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: fpext_v2f16_to_v2f64:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fpext_v2f16_to_v2f64:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v1.l
+; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
+; GFX11-TRUE16-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fpext_v2f16_to_v2f64:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
+; GFX11-FAKE16-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
@@ -387,19 +479,35 @@ define amdgpu_kernel void @s_fneg_fpext_f16_to_f32(ptr addrspace(1) %r, i32 %a)
; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
-; GFX11-LABEL: s_fneg_fpext_f16_to_f32:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s4
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: s_fneg_fpext_f16_to_f32:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: s_fneg_fpext_f16_to_f32:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, s4
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
%a.trunc = trunc i32 %a to i16
%a.val = bitcast i16 %a.trunc to half
@@ -463,25 +571,45 @@ define amdgpu_kernel void @fneg_fpext_f16_to_f32(
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: fneg_fpext_f16_to_f32:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e64 v0, -v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fneg_fpext_f16_to_f32:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, -v0.l
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fneg_fpext_f16_to_f32:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v0, -v0
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
@@ -547,25 +675,45 @@ define amdgpu_kernel void @fabs_fpext_f16_to_f32(
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: fabs_fpext_f16_to_f32:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fabs_fpext_f16_to_f32:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, |v0.l|
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fabs_fpext_f16_to_f32:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v0, |v0|
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
@@ -631,25 +779,45 @@ define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32(
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: fneg_fabs_fpext_f16_to_f32:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e64 v0, -|v0|
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fneg_fabs_fpext_f16_to_f32:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, -|v0.l|
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fneg_fabs_fpext_f16_to_f32:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v0, -|v0|
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
@@ -730,29 +898,55 @@ define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32(
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: fneg_multi_use_fpext_f16_to_f32:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -v0
-; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fneg_multi_use_fpext_f16_to_f32:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX11-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, -v0.l
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fneg_multi_use_fpext_f16_to_f32:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v1, -v0
+; GFX11-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-FAKE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
@@ -833,29 +1027,55 @@ define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32(
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -v0
-; GFX11-NEXT: v_mul_f16_e64 v0, -v0, v0
-; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.h, -v0.l, v0.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v1, -v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v1, -v0
+; GFX11-FAKE16-NEXT: v_mul_f16_e64 v0, -v0, v0
+; GFX11-FAKE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
@@ -935,29 +1155,55 @@ define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32(
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: fabs_multi_use_fpext_f16_to_f32:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e64 v1, |v0|
-; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fabs_multi_use_fpext_f16_to_f32:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, |v0.l|
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fabs_multi_use_fpext_f16_to_f32:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v1, |v0|
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
@@ -1038,29 +1284,55 @@ define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32(
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e64 v1, |v0|
-; GFX11-NEXT: v_mul_f16_e64 v0, |v0|, v0
-; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.h, |v0.l|, v0.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v1, |v0.l|
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v1, |v0|
+; GFX11-FAKE16-NEXT: v_mul_f16_e64 v0, |v0|, v0
+; GFX11-FAKE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
@@ -1140,29 +1412,55 @@ define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32(
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -|v0|
-; GFX11-NEXT: v_or_b32_e32 v0, 0x8000, v0
-; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x8000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, -|v0.l|
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v1, -|v0|
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, 0x8000, v0
+; GFX11-FAKE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
@@ -1244,29 +1542,55 @@ define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32(
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -|v0|
-; GFX11-NEXT: v_mul_f16_e64 v0, -|v0|, v0
-; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.h, -|v0.l|, v0.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v1, -|v0.l|
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v1, -|v0|
+; GFX11-FAKE16-NEXT: v_mul_f16_e64 v0, -|v0|, v0
+; GFX11-FAKE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
index 6e8e6c0..786fe03 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
@@ -136,12 +136,12 @@ define i128 @fptosi_f64_to_i128(double %x) {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_mov_b32_e32 v5, v1
; GISEL-NEXT: v_mov_b32_e32 v4, v0
-; GISEL-NEXT: v_lshrrev_b32_e32 v0, 20, v5
-; GISEL-NEXT: v_and_b32_e32 v6, 0x7ff, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v2, 20, v5
; GISEL-NEXT: v_mov_b32_e32 v0, 0x3ff
; GISEL-NEXT: s_mov_b64 s[4:5], 0
-; GISEL-NEXT: v_mov_b32_e32 v1, 0
; GISEL-NEXT: v_mov_b32_e32 v7, 0
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: v_and_b32_e32 v6, 0x7ff, v2
; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5]
; GISEL-NEXT: v_mov_b32_e32 v0, s4
@@ -508,12 +508,12 @@ define i128 @fptoui_f64_to_i128(double %x) {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_mov_b32_e32 v5, v1
; GISEL-NEXT: v_mov_b32_e32 v4, v0
-; GISEL-NEXT: v_lshrrev_b32_e32 v0, 20, v5
-; GISEL-NEXT: v_and_b32_e32 v6, 0x7ff, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v2, 20, v5
; GISEL-NEXT: v_mov_b32_e32 v0, 0x3ff
; GISEL-NEXT: s_mov_b64 s[4:5], 0
-; GISEL-NEXT: v_mov_b32_e32 v1, 0
; GISEL-NEXT: v_mov_b32_e32 v7, 0
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: v_and_b32_e32 v6, 0x7ff, v2
; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5]
; GISEL-NEXT: v_mov_b32_e32 v0, s4
diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
index 0e12cca..327f265 100644
--- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
@@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
+
define amdgpu_kernel void @fptosi_f16_to_i16(
; SI-LABEL: fptosi_f16_to_i16:
@@ -41,25 +43,45 @@ define amdgpu_kernel void @fptosi_f16_to_i16(
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: fptosi_f16_to_i16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cvt_i16_f16_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fptosi_f16_to_i16:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_cvt_i16_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fptosi_f16_to_i16:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cvt_i16_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
@@ -108,27 +130,49 @@ define amdgpu_kernel void @fptosi_f16_to_i32(
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: fptosi_f16_to_i32:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fptosi_f16_to_i32:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fptosi_f16_to_i32:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
@@ -182,28 +226,51 @@ define amdgpu_kernel void @fptosi_f16_to_i64(
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: fptosi_f16_to_i64:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fptosi_f16_to_i64:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fptosi_f16_to_i64:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
@@ -259,31 +326,60 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i16(
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: fptosi_v2f16_to_v2i16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_cvt_i16_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cvt_i16_f16_e32 v1, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fptosi_v2f16_to_v2i16:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-TRUE16-NEXT: v_cvt_i16_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cvt_i16_f16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fptosi_v2f16_to_v2i16:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT: v_cvt_i16_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cvt_i16_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
@@ -337,31 +433,57 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i32(
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: fptosi_v2f16_to_v2i32:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fptosi_v2f16_to_v2i32:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l
+; GFX11-TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1
+; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fptosi_v2f16_to_v2i32:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1
+; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
@@ -422,34 +544,63 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i64(
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: fptosi_v2f16_to_v2i64:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v1
-; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fptosi_v2f16_to_v2i64:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l
+; GFX11-TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v1
+; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-TRUE16-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fptosi_v2f16_to_v2i64:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cvt_i32_f32_e32 v2, v1
+; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-FAKE16-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
@@ -485,21 +636,38 @@ define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) {
; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: fptosi_f16_to_i1:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_eq_f16_e64 s2, -1.0, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fptosi_f16_to_i1:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, -1.0, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fptosi_f16_to_i1:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_eq_f16_e64 s2, -1.0, s4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
%conv = fptosi half %in to i1
store i1 %conv, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
index abc5c7a..ba540f4 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
@@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
+
define amdgpu_kernel void @fptoui_f16_to_i16(
; SI-LABEL: fptoui_f16_to_i16:
@@ -41,25 +43,45 @@ define amdgpu_kernel void @fptoui_f16_to_i16(
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: fptoui_f16_to_i16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cvt_u16_f16_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fptoui_f16_to_i16:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_cvt_u16_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fptoui_f16_to_i16:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cvt_u16_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
@@ -108,27 +130,49 @@ define amdgpu_kernel void @fptoui_f16_to_i32(
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: fptoui_f16_to_i32:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fptoui_f16_to_i32:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fptoui_f16_to_i32:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
@@ -182,28 +226,51 @@ define amdgpu_kernel void @fptoui_f16_to_i64(
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: fptoui_f16_to_i64:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fptoui_f16_to_i64:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fptoui_f16_to_i64:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
@@ -258,31 +325,60 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i16(
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: fptoui_v2f16_to_v2i16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_cvt_u16_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cvt_u16_f16_e32 v1, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fptoui_v2f16_to_v2i16:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-TRUE16-NEXT: v_cvt_u16_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cvt_u16_f16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fptoui_v2f16_to_v2i16:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT: v_cvt_u16_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cvt_u16_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
@@ -336,31 +432,57 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i32(
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: fptoui_v2f16_to_v2i32:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fptoui_v2f16_to_v2i32:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l
+; GFX11-TRUE16-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cvt_u32_f32_e32 v1, v1
+; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fptoui_v2f16_to_v2i32:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cvt_u32_f32_e32 v1, v1
+; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
@@ -421,33 +543,61 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i64(
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: fptoui_v2f16_to_v2i64:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX11-NEXT: v_mov_b32_e32 v3, v1
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fptoui_v2f16_to_v2i64:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cvt_u32_f32_e32 v2, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1
+; GFX11-TRUE16-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fptoui_v2f16_to_v2i64:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-FAKE16-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cvt_u32_f32_e32 v2, v2
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v1
+; GFX11-FAKE16-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
@@ -484,21 +634,38 @@ define amdgpu_kernel void @fptoui_f16_to_i1(ptr addrspace(1) %out, half %in) {
; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: fptoui_f16_to_i1:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_eq_f16_e64 s2, 1.0, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fptoui_f16_to_i1:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, 1.0, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fptoui_f16_to_i1:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_eq_f16_e64 s2, 1.0, s4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
%conv = fptoui half %in to i1
store i1 %conv, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
index c62b4e5..2e2a109 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
@@ -996,7 +996,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_i16(i16 inreg
; GISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
; GISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; GISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
- ; GISEL-GFX11-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY2]], 0, [[COPY1]], 0, 0, implicit $exec
+ ; GISEL-GFX11-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY2]], 0, [[COPY1]], 0, 0, implicit $exec
; GISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
; GISEL-GFX11-NEXT: FLAT_STORE_SHORT [[COPY3]], [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
; GISEL-GFX11-NEXT: S_ENDPGM 0
@@ -1020,7 +1020,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_i16(i16 inreg
; DAGISEL-GFX11-WF32-NEXT: {{ $}}
; DAGISEL-GFX11-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
; DAGISEL-GFX11-WF32-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; DAGISEL-GFX11-WF32-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
+ ; DAGISEL-GFX11-WF32-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; DAGISEL-GFX11-WF32-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
@@ -1032,7 +1032,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_i16(i16 inreg
; DAGISEL-GFX11-WF64-NEXT: {{ $}}
; DAGISEL-GFX11-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
; DAGISEL-GFX11-WF64-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; DAGISEL-GFX11-WF64-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
+ ; DAGISEL-GFX11-WF64-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; DAGISEL-GFX11-WF64-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
index 38d928a..2999ddb 100644
--- a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
@@ -673,38 +673,38 @@ define double @sitofp_i128_to_f64(i128 %x) {
; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v3
; GISEL-NEXT: v_xor_b32_e32 v0, v6, v4
; GISEL-NEXT: v_xor_b32_e32 v1, v6, v5
-; GISEL-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v6
-; GISEL-NEXT: v_xor_b32_e32 v2, v6, v2
-; GISEL-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
-; GISEL-NEXT: v_xor_b32_e32 v3, v6, v3
-; GISEL-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, vcc
-; GISEL-NEXT: v_ffbh_u32_e32 v5, v0
-; GISEL-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v6, vcc
-; GISEL-NEXT: v_ffbh_u32_e32 v4, v1
-; GISEL-NEXT: v_add_u32_e32 v5, 32, v5
-; GISEL-NEXT: v_ffbh_u32_e32 v7, v2
-; GISEL-NEXT: v_min_u32_e32 v4, v4, v5
-; GISEL-NEXT: v_ffbh_u32_e32 v5, v3
+; GISEL-NEXT: v_xor_b32_e32 v4, v6, v2
+; GISEL-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v6
+; GISEL-NEXT: v_xor_b32_e32 v5, v6, v3
+; GISEL-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v6, vcc
+; GISEL-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v6, vcc
+; GISEL-NEXT: v_ffbh_u32_e32 v1, v2
+; GISEL-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v6, vcc
+; GISEL-NEXT: v_ffbh_u32_e32 v0, v3
+; GISEL-NEXT: v_add_u32_e32 v1, 32, v1
+; GISEL-NEXT: v_ffbh_u32_e32 v7, v4
+; GISEL-NEXT: v_min_u32_e32 v0, v0, v1
+; GISEL-NEXT: v_ffbh_u32_e32 v1, v5
; GISEL-NEXT: v_add_u32_e32 v7, 32, v7
-; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
-; GISEL-NEXT: v_add_u32_e32 v4, 64, v4
-; GISEL-NEXT: v_min_u32_e32 v5, v5, v7
-; GISEL-NEXT: v_cndmask_b32_e32 v9, v5, v4, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5]
+; GISEL-NEXT: v_add_u32_e32 v0, 64, v0
+; GISEL-NEXT: v_min_u32_e32 v1, v1, v7
+; GISEL-NEXT: v_cndmask_b32_e32 v9, v1, v0, vcc
; GISEL-NEXT: v_sub_u32_e32 v8, 0x80, v9
; GISEL-NEXT: v_sub_u32_e32 v7, 0x7f, v9
; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 53, v8
; GISEL-NEXT: ; implicit-def: $vgpr10
-; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GISEL-NEXT: ; %bb.2: ; %itofp-if-else
-; GISEL-NEXT: v_add_u32_e32 v2, 0xffffffb5, v9
-; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
-; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
+; GISEL-NEXT: v_add_u32_e32 v4, 0xffffffb5, v9
+; GISEL-NEXT: v_lshlrev_b64 v[0:1], v4, v[2:3]
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4
+; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v1, vcc
; GISEL-NEXT: ; implicit-def: $vgpr8
-; GISEL-NEXT: ; implicit-def: $vgpr0
+; GISEL-NEXT: ; implicit-def: $vgpr2
; GISEL-NEXT: ; implicit-def: $vgpr9
; GISEL-NEXT: ; %bb.3: ; %Flow3
; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
@@ -721,89 +721,88 @@ define double @sitofp_i128_to_f64(i128 %x) {
; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default
; GISEL-NEXT: v_sub_u32_e32 v14, 0x49, v9
; GISEL-NEXT: v_sub_u32_e32 v10, 64, v14
-; GISEL-NEXT: v_lshrrev_b64 v[4:5], v14, v[0:1]
-; GISEL-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3]
+; GISEL-NEXT: v_lshrrev_b64 v[0:1], v14, v[2:3]
+; GISEL-NEXT: v_lshlrev_b64 v[10:11], v10, v[4:5]
; GISEL-NEXT: v_subrev_u32_e32 v15, 64, v14
-; GISEL-NEXT: v_or_b32_e32 v10, v4, v10
-; GISEL-NEXT: v_or_b32_e32 v11, v5, v11
-; GISEL-NEXT: v_lshrrev_b64 v[4:5], v15, v[2:3]
-; GISEL-NEXT: v_lshrrev_b64 v[12:13], v14, v[2:3]
+; GISEL-NEXT: v_lshrrev_b64 v[12:13], v14, v[4:5]
+; GISEL-NEXT: v_or_b32_e32 v10, v0, v10
+; GISEL-NEXT: v_or_b32_e32 v11, v1, v11
+; GISEL-NEXT: v_lshrrev_b64 v[0:1], v15, v[4:5]
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14
+; GISEL-NEXT: v_add_u32_e32 v9, 55, v9
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14
-; GISEL-NEXT: v_add_u32_e32 v14, 55, v9
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc
-; GISEL-NEXT: v_sub_u32_e32 v11, 64, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v13, v4, v0, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v4, v5, v1, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v12, vcc
-; GISEL-NEXT: v_lshrrev_b64 v[9:10], v14, -1
-; GISEL-NEXT: v_lshlrev_b64 v[11:12], v11, -1
-; GISEL-NEXT: v_subrev_u32_e32 v15, 64, v14
-; GISEL-NEXT: v_or_b32_e32 v16, v9, v11
-; GISEL-NEXT: v_or_b32_e32 v17, v10, v12
-; GISEL-NEXT: v_lshrrev_b64 v[11:12], v15, -1
-; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14
-; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v16, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v17, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14
-; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, -1, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v12, v12, -1, s[4:5]
-; GISEL-NEXT: v_and_b32_e32 v2, v9, v2
-; GISEL-NEXT: v_and_b32_e32 v3, v10, v3
-; GISEL-NEXT: v_and_or_b32 v0, v11, v0, v2
-; GISEL-NEXT: v_and_or_b32 v1, v12, v1, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v12, vcc
+; GISEL-NEXT: v_sub_u32_e32 v12, 64, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v14, v0, v2, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v10, v1, v3, s[4:5]
+; GISEL-NEXT: v_lshrrev_b64 v[0:1], v9, -1
+; GISEL-NEXT: v_lshlrev_b64 v[12:13], v12, -1
+; GISEL-NEXT: v_subrev_u32_e32 v15, 64, v9
+; GISEL-NEXT: v_or_b32_e32 v16, v0, v12
+; GISEL-NEXT: v_or_b32_e32 v17, v1, v13
+; GISEL-NEXT: v_lshrrev_b64 v[12:13], v15, -1
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v9
+; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9
+; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v9, v12, -1, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v12, v13, -1, s[4:5]
+; GISEL-NEXT: v_and_b32_e32 v0, v0, v4
+; GISEL-NEXT: v_and_b32_e32 v1, v1, v5
+; GISEL-NEXT: v_and_or_b32 v0, v9, v2, v0
+; GISEL-NEXT: v_and_or_b32 v1, v12, v3, v1
; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_or_b32_e32 v3, v13, v0
-; GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-NEXT: v_mov_b32_e32 v1, v4
-; GISEL-NEXT: v_mov_b32_e32 v2, v5
-; GISEL-NEXT: v_mov_b32_e32 v3, v6
+; GISEL-NEXT: v_or_b32_e32 v9, v14, v0
+; GISEL-NEXT: v_mov_b32_e32 v2, v9
+; GISEL-NEXT: v_mov_b32_e32 v3, v10
+; GISEL-NEXT: v_mov_b32_e32 v4, v11
+; GISEL-NEXT: v_mov_b32_e32 v5, v12
; GISEL-NEXT: .LBB2_7: ; %Flow1
; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
; GISEL-NEXT: .LBB2_8: ; %Flow2
; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11]
; GISEL-NEXT: s_cbranch_execz .LBB2_10
; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb
-; GISEL-NEXT: v_lshlrev_b64 v[9:10], 1, v[0:1]
-; GISEL-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GISEL-NEXT: v_lshrrev_b32_e32 v0, 31, v1
-; GISEL-NEXT: v_or_b32_e32 v11, v2, v0
-; GISEL-NEXT: v_mov_b32_e32 v0, v9
-; GISEL-NEXT: v_mov_b32_e32 v1, v10
-; GISEL-NEXT: v_mov_b32_e32 v2, v11
-; GISEL-NEXT: v_mov_b32_e32 v3, v12
+; GISEL-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5]
+; GISEL-NEXT: v_lshlrev_b64 v[0:1], 1, v[2:3]
+; GISEL-NEXT: v_lshrrev_b32_e32 v2, 31, v3
+; GISEL-NEXT: v_or_b32_e32 v2, v4, v2
+; GISEL-NEXT: v_mov_b32_e32 v5, v3
+; GISEL-NEXT: v_mov_b32_e32 v4, v2
+; GISEL-NEXT: v_mov_b32_e32 v3, v1
+; GISEL-NEXT: v_mov_b32_e32 v2, v0
; GISEL-NEXT: .LBB2_10: ; %itofp-sw-epilog
; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GISEL-NEXT: v_bfe_u32 v3, v0, 2, 1
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v3
-; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0
-; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GISEL-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
-; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1]
+; GISEL-NEXT: v_bfe_u32 v0, v2, 2, 1
+; GISEL-NEXT: v_or_b32_e32 v0, v2, v0
+; GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 1, v0
+; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GISEL-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GISEL-NEXT: v_lshrrev_b64 v[0:1], 2, v[2:3]
; GISEL-NEXT: v_mov_b32_e32 v9, 0
-; GISEL-NEXT: v_and_b32_e32 v10, 0x800000, v1
+; GISEL-NEXT: v_and_b32_e32 v10, 0x800000, v3
; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[9:10]
-; GISEL-NEXT: v_lshl_or_b32 v10, v2, 30, v5
+; GISEL-NEXT: v_lshl_or_b32 v10, v4, 30, v1
; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20
-; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1]
+; GISEL-NEXT: v_lshrrev_b64 v[0:1], 3, v[2:3]
; GISEL-NEXT: v_mov_b32_e32 v7, v8
-; GISEL-NEXT: v_lshl_or_b32 v10, v2, 29, v5
+; GISEL-NEXT: v_lshl_or_b32 v10, v4, 29, v1
; GISEL-NEXT: ; %bb.12: ; %Flow
; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
; GISEL-NEXT: .LBB2_13: ; %Flow4
; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
-; GISEL-NEXT: v_and_b32_e32 v0, 0x80000000, v6
-; GISEL-NEXT: v_mov_b32_e32 v1, 0x3ff00000
-; GISEL-NEXT: v_mov_b32_e32 v2, 0xfffff
-; GISEL-NEXT: v_lshl_add_u32 v1, v7, 20, v1
-; GISEL-NEXT: v_and_or_b32 v2, v10, v2, v0
-; GISEL-NEXT: v_and_or_b32 v0, v4, -1, 0
-; GISEL-NEXT: v_or3_b32 v1, v2, v1, 0
+; GISEL-NEXT: v_and_b32_e32 v1, 0x80000000, v6
+; GISEL-NEXT: v_mov_b32_e32 v2, 0x3ff00000
+; GISEL-NEXT: v_mov_b32_e32 v3, 0xfffff
+; GISEL-NEXT: v_lshl_add_u32 v2, v7, 20, v2
+; GISEL-NEXT: v_and_or_b32 v1, v10, v3, v1
+; GISEL-NEXT: v_or3_b32 v1, v1, v2, 0
; GISEL-NEXT: .LBB2_14: ; %Flow5
; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
; GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1083,7 +1082,6 @@ define double @uitofp_i128_to_f64(i128 %x) {
; GISEL-NEXT: v_mov_b32_e32 v0, 0x3ff00000
; GISEL-NEXT: v_lshl_add_u32 v0, v6, 20, v0
; GISEL-NEXT: v_and_b32_e32 v1, 0xfffff, v9
-; GISEL-NEXT: v_and_or_b32 v4, v4, -1, 0
; GISEL-NEXT: v_or3_b32 v5, v1, v0, 0
; GISEL-NEXT: .LBB3_14: ; %Flow5
; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
index b08a35a..9169433 100644
--- a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
@@ -1,7 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
define amdgpu_kernel void @sitofp_i16_to_f16(
; SI-LABEL: sitofp_i16_to_f16:
@@ -41,25 +42,45 @@ define amdgpu_kernel void @sitofp_i16_to_f16(
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: sitofp_i16_to_f16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cvt_f16_i16_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: sitofp_i16_to_f16:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: sitofp_i16_to_f16:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v0, v0
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
@@ -108,27 +129,49 @@ define amdgpu_kernel void @sitofp_i32_to_f16(
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: sitofp_i32_to_f16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: sitofp_i32_to_f16:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: sitofp_i32_to_f16:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
@@ -186,29 +229,56 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: sitofp_v2i16_to_v2f16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_cvt_f16_i16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_f16_i16_e32 v1, v1
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: sitofp_v2i16_to_v2f16:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: sitofp_v2i16_to_v2f16:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v1, v1
+; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
@@ -264,31 +334,60 @@ define amdgpu_kernel void @sitofp_v2i32_to_v2f16(
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: sitofp_v2i32_to_v2f16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
-; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: sitofp_v2i32_to_v2f16:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: sitofp_v2i32_to_v2f16:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GFX11-FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
@@ -353,37 +452,69 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add
; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: s_sint_to_fp_i1_to_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s6
-; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
-; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
-; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cmp_le_f32_e64 s0, 0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, s0, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: s_sint_to_fp_i1_to_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, s10
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, s11
+; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
+; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s12, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s13, s7
+; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s4
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s5
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: s_sint_to_fp_i1_to_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, s10
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, s11
+; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s12, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s13, s7
+; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s4
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s5
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, s0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
%a = load float, ptr addrspace(1) %in0
%b = load float, ptr addrspace(1) %in1
%acmp = fcmp oge float %a, 0.000000e+00
diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
index c21ae43..c4268c1 100644
--- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
@@ -1,7 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
define amdgpu_kernel void @uitofp_i16_to_f16(
; SI-LABEL: uitofp_i16_to_f16:
@@ -41,25 +42,45 @@ define amdgpu_kernel void @uitofp_i16_to_f16(
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: uitofp_i16_to_f16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: uitofp_i16_to_f16:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: uitofp_i16_to_f16:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cvt_f16_u16_e32 v0, v0
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
@@ -108,27 +129,49 @@ define amdgpu_kernel void @uitofp_i32_to_f16(
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: uitofp_i32_to_f16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: uitofp_i32_to_f16:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: uitofp_i32_to_f16:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
@@ -186,29 +229,56 @@ define amdgpu_kernel void @uitofp_v2i16_to_v2f16(
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: uitofp_v2i16_to_v2f16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_f16_u16_e32 v1, v1
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: uitofp_v2i16_to_v2f16:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: uitofp_v2i16_to_v2f16:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT: v_cvt_f16_u16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cvt_f16_u16_e32 v1, v1
+; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
@@ -264,31 +334,60 @@ define amdgpu_kernel void @uitofp_v2i32_to_v2f16(
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: uitofp_v2i32_to_v2f16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
-; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: uitofp_v2i32_to_v2f16:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: uitofp_v2i32_to_v2f16:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cvt_f32_u32_e32 v1, v1
+; GFX11-FAKE16-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
@@ -353,37 +452,69 @@ define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add
; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: s_uint_to_fp_i1_to_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s6
-; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
-; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
-; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cmp_le_f32_e64 s0, 0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, s0, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: s_uint_to_fp_i1_to_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, s10
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, s11
+; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
+; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s12, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s13, s7
+; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s4
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s5
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_nop 0
+; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: s_uint_to_fp_i1_to_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, s10
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, s11
+; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s12, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s13, s7
+; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s4
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s5
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, s0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_nop 0
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT: s_endpgm
%a = load float, ptr addrspace(1) %in0
%b = load float, ptr addrspace(1) %in1
%acmp = fcmp oge float %a, 0.000000e+00
diff --git a/llvm/test/CodeGen/DirectX/WaveReadLaneAt-vec.ll b/llvm/test/CodeGen/DirectX/WaveReadLaneAt-vec.ll
new file mode 100644
index 0000000..8c2a11a
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/WaveReadLaneAt-vec.ll
@@ -0,0 +1,35 @@
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-compute %s | FileCheck %s
+
+; Test that, for vector values, WaveReadLaneAt is scalarized and mapped down to
+; the DirectX op.
+
+define noundef <2 x half> @wave_read_lane_v2half(<2 x half> noundef %expr, i32 %idx) {
+entry:
+; CHECK: call half @dx.op.waveReadLaneAt.f16(i32 117, half %expr.i0, i32 %idx)
+; CHECK: call half @dx.op.waveReadLaneAt.f16(i32 117, half %expr.i1, i32 %idx)
+ %ret = call <2 x half> @llvm.dx.wave.readlane.f16(<2 x half> %expr, i32 %idx)
+ ret <2 x half> %ret
+}
+
+define noundef <3 x i32> @wave_read_lane_v3i32(<3 x i32> noundef %expr, i32 %idx) {
+entry:
+; CHECK: call i32 @dx.op.waveReadLaneAt.i32(i32 117, i32 %expr.i0, i32 %idx)
+; CHECK: call i32 @dx.op.waveReadLaneAt.i32(i32 117, i32 %expr.i1, i32 %idx)
+; CHECK: call i32 @dx.op.waveReadLaneAt.i32(i32 117, i32 %expr.i2, i32 %idx)
+ %ret = call <3 x i32> @llvm.dx.wave.readlane(<3 x i32> %expr, i32 %idx)
+ ret <3 x i32> %ret
+}
+
+define noundef <4 x double> @wave_read_lane_v4f64(<4 x double> noundef %expr, i32 %idx) {
+entry:
+; CHECK: call double @dx.op.waveReadLaneAt.f64(i32 117, double %expr.i0, i32 %idx)
+; CHECK: call double @dx.op.waveReadLaneAt.f64(i32 117, double %expr.i1, i32 %idx)
+; CHECK: call double @dx.op.waveReadLaneAt.f64(i32 117, double %expr.i2, i32 %idx)
+; CHECK: call double @dx.op.waveReadLaneAt.f64(i32 117, double %expr.i3, i32 %idx)
+ %ret = call <4 x double> @llvm.dx.wave.readlane(<4 x double> %expr, i32 %idx)
+ ret <4 x double> %ret
+}
+
+declare <2 x half> @llvm.dx.wave.readlane.v2f16(<2 x half>, i32)
+declare <3 x i32> @llvm.dx.wave.readlane.v3i32(<3 x i32>, i32)
+declare <4 x double> @llvm.dx.wave.readlane.v4f64(<4 x double>, i32)
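As the CHECK lines require, the scalarizer first splits each vector operand
into its elements and dxil-op-lower then emits one dx.op.waveReadLaneAt call
per element, all sharing opcode 117 and the same lane index. A minimal
before/after sketch for the <2 x half> case, assuming the %expr.i0/%expr.i1
names the scalarizer produces:

; before: a single vector intrinsic call
%ret = call <2 x half> @llvm.dx.wave.readlane.v2f16(<2 x half> %expr, i32 %idx)
; after -scalarizer -dxil-op-lower: one DXIL op per element
%r0 = call half @dx.op.waveReadLaneAt.f16(i32 117, half %expr.i0, i32 %idx)
%r1 = call half @dx.op.waveReadLaneAt.f16(i32 117, half %expr.i1, i32 %idx)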
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index a7dbc4c..0136942 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -220,6 +220,22 @@ entry:
}
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @memcpy_from_param_noalign(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) %s) local_unnamed_addr #0 {
+; COMMON-LABEL: define dso_local void @memcpy_from_param_noalign(
+; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true)
+; COMMON-NEXT: ret void
+;
+entry:
+ tail call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %s, i64 16, i1 true)
+ ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr nocapture noundef readnone byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
; COMMON-LABEL: define dso_local void @memcpy_to_param(
; COMMON-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef readnone byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
@@ -426,7 +442,7 @@ attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite
attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) }
!llvm.module.flags = !{!0, !1, !2, !3}
-!nvvm.annotations = !{!4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19}
+!nvvm.annotations = !{!4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !23}
!llvm.ident = !{!20, !21}
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 8]}
@@ -451,3 +467,4 @@ attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) }
!19 = !{ptr @test_select_write, !"kernel", i32 1}
!20 = !{!"clang version 20.0.0git"}
!21 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!23 = !{ptr @memcpy_from_param_noalign, !"kernel", i32 1}
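The added memcpy_from_param_noalign case checks the lowering of a read from a
byval kernel argument that carries no explicit alignment: as the COMMON lines
above encode, the byval pointer is recast into the param address space (101)
and the memcpy source is rewritten to read from there, while the destination
is routed through the global space. In sketch form (value names follow the
[[S3]]/[[OUT2]] captures):

; input
tail call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %s, i64 16, i1 true)
; after lowering: the source reads directly from the param space
%s3 = addrspacecast ptr %s to ptr addrspace(101)
call void @llvm.memcpy.p0.p101.i64(ptr %out2, ptr addrspace(101) %s3, i64 16, i1 true)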
diff --git a/llvm/test/CodeGen/PowerPC/early-ifcvt-no-isel.mir b/llvm/test/CodeGen/PowerPC/early-ifcvt-no-isel.mir
index 99a3f80..794480b 100644
--- a/llvm/test/CodeGen/PowerPC/early-ifcvt-no-isel.mir
+++ b/llvm/test/CodeGen/PowerPC/early-ifcvt-no-isel.mir
@@ -1,6 +1,8 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
# RUN: llc -mtriple=powerpc64-ibm-aix -mcpu=pwr7 -simplify-mir -verify-machineinstrs \
# RUN: -run-pass=early-ifcvt %s -o - | FileCheck %s
+# RUN: llc -mtriple=powerpc64-ibm-aix -mcpu=pwr7 -simplify-mir -verify-each \
+# RUN: -passes=early-ifcvt %s -o - | FileCheck %s
--- |
source_filename = "<stdin>"
diff --git a/llvm/test/CodeGen/PowerPC/scalar-rounding-ops.ll b/llvm/test/CodeGen/PowerPC/scalar-rounding-ops.ll
index e950c0a..2be370f 100644
--- a/llvm/test/CodeGen/PowerPC/scalar-rounding-ops.ll
+++ b/llvm/test/CodeGen/PowerPC/scalar-rounding-ops.ll
@@ -214,6 +214,48 @@ entry:
declare i64 @llvm.lround.i64.f64(double)
+define dso_local i32 @test_lroundi32f64(double %d) local_unnamed_addr {
+; BE-LABEL: test_lroundi32f64:
+; BE: # %bb.0: # %entry
+; BE-NEXT: mflr r0
+; BE-NEXT: stdu r1, -112(r1)
+; BE-NEXT: std r0, 128(r1)
+; BE-NEXT: .cfi_def_cfa_offset 112
+; BE-NEXT: .cfi_offset lr, 16
+; BE-NEXT: bl lround
+; BE-NEXT: nop
+; BE-NEXT: addi r1, r1, 112
+; BE-NEXT: ld r0, 16(r1)
+; BE-NEXT: mtlr r0
+; BE-NEXT: blr
+;
+; CHECK-LABEL: test_lroundi32f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: mflr r0
+; CHECK-NEXT: stdu r1, -32(r1)
+; CHECK-NEXT: std r0, 48(r1)
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_offset lr, 16
+; CHECK-NEXT: bl lround
+; CHECK-NEXT: nop
+; CHECK-NEXT: addi r1, r1, 32
+; CHECK-NEXT: ld r0, 16(r1)
+; CHECK-NEXT: mtlr r0
+; CHECK-NEXT: blr
+;
+; FAST-LABEL: test_lroundi32f64:
+; FAST: # %bb.0: # %entry
+; FAST-NEXT: xsrdpi f0, f1
+; FAST-NEXT: fctiw f0, f0
+; FAST-NEXT: mffprwz r3, f0
+; FAST-NEXT: blr
+entry:
+ %0 = tail call i32 @llvm.lround.i32.f64(double %d)
+ ret i32 %0
+}
+
+declare i32 @llvm.lround.i32.f64(double)
+
define dso_local i64 @test_lroundf(float %f) local_unnamed_addr {
; BE-LABEL: test_lroundf:
; BE: # %bb.0: # %entry
@@ -256,6 +298,48 @@ entry:
declare i64 @llvm.lround.i64.f32(float)
+define dso_local i32 @test_lroundi32f32(float %d) local_unnamed_addr {
+; BE-LABEL: test_lroundi32f32:
+; BE: # %bb.0: # %entry
+; BE-NEXT: mflr r0
+; BE-NEXT: stdu r1, -112(r1)
+; BE-NEXT: std r0, 128(r1)
+; BE-NEXT: .cfi_def_cfa_offset 112
+; BE-NEXT: .cfi_offset lr, 16
+; BE-NEXT: bl lroundf
+; BE-NEXT: nop
+; BE-NEXT: addi r1, r1, 112
+; BE-NEXT: ld r0, 16(r1)
+; BE-NEXT: mtlr r0
+; BE-NEXT: blr
+;
+; CHECK-LABEL: test_lroundi32f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: mflr r0
+; CHECK-NEXT: stdu r1, -32(r1)
+; CHECK-NEXT: std r0, 48(r1)
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_offset lr, 16
+; CHECK-NEXT: bl lroundf
+; CHECK-NEXT: nop
+; CHECK-NEXT: addi r1, r1, 32
+; CHECK-NEXT: ld r0, 16(r1)
+; CHECK-NEXT: mtlr r0
+; CHECK-NEXT: blr
+;
+; FAST-LABEL: test_lroundi32f32:
+; FAST: # %bb.0: # %entry
+; FAST-NEXT: xsrdpi f0, f1
+; FAST-NEXT: fctiw f0, f0
+; FAST-NEXT: mffprwz r3, f0
+; FAST-NEXT: blr
+entry:
+ %0 = tail call i32 @llvm.lround.i32.f32(float %d)
+ ret i32 %0
+}
+
+declare i32 @llvm.lround.i32.f32(float)
+
define dso_local i64 @test_llround(double %d) local_unnamed_addr {
; BE-LABEL: test_llround:
; BE: # %bb.0: # %entry
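The two new functions extend the existing i64 lround coverage to the i32
result type. In the default configurations (BE, CHECK) both still compile to a
libcall to lround/lroundf, whose i32 result is already in r3; in the FAST
configuration the libcall is replaced by an inline sequence: xsrdpi (round to
nearest, ties away from zero), fctiw (convert to signed word), and mffprwz to
move the result to a GPR. The IR pattern under test, for reference:

%r = tail call i32 @llvm.lround.i32.f64(double %d)  ; FAST: xsrdpi, fctiw, mffprwz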
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll b/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll
index 8cb6fed..5460cae 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple riscv32 -mattr=+m,+f,+d,+v,+zfh,+zvfh < %s | FileCheck %s
-; RUN: llc -mtriple riscv64 -mattr=+m,+f,+d,+v,+zfh,+zvfh < %s | FileCheck %s
+; RUN: llc -mtriple riscv32 -mattr=+m,+f,+d,+v,+zvfhmin,+zvfbfmin < %s | FileCheck %s
+; RUN: llc -mtriple riscv64 -mattr=+m,+f,+d,+v,+zvfhmin,+zvfbfmin < %s | FileCheck %s
+; RUN: llc -mtriple riscv32 -mattr=+m,+f,+d,+v,+zvfh,+zvfbfmin < %s | FileCheck %s
+; RUN: llc -mtriple riscv64 -mattr=+m,+f,+d,+v,+zvfh,+zvfbfmin < %s | FileCheck %s
; Tests assume VLEN=128 or vscale_range_min=2.
@@ -1533,6 +1535,333 @@ define <vscale x 8 x i64> @splice_nxv8i64_offset_max(<vscale x 8 x i64> %a, <vsc
ret <vscale x 8 x i64> %res
}
+declare <vscale x 1 x bfloat> @llvm.vector.splice.nxv1bf16(<vscale x 1 x bfloat>, <vscale x 1 x bfloat>, i32)
+
+define <vscale x 1 x bfloat> @splice_nxv1bf16_offset_zero(<vscale x 1 x bfloat> %a, <vscale x 1 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv1bf16_offset_zero:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ret
+ %res = call <vscale x 1 x bfloat> @llvm.vector.splice.nxv1bf16(<vscale x 1 x bfloat> %a, <vscale x 1 x bfloat> %b, i32 0)
+ ret <vscale x 1 x bfloat> %res
+}
+
+define <vscale x 1 x bfloat> @splice_nxv1bf16_offset_negone(<vscale x 1 x bfloat> %a, <vscale x 1 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv1bf16_offset_negone:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: srli a0, a0, 3
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: vslideup.vi v8, v9, 1
+; CHECK-NEXT: ret
+ %res = call <vscale x 1 x bfloat> @llvm.vector.splice.nxv1bf16(<vscale x 1 x bfloat> %a, <vscale x 1 x bfloat> %b, i32 -1)
+ ret <vscale x 1 x bfloat> %res
+}
+
+define <vscale x 1 x bfloat> @splice_nxv1bf16_offset_min(<vscale x 1 x bfloat> %a, <vscale x 1 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv1bf16_offset_min:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: srli a0, a0, 3
+; CHECK-NEXT: addi a0, a0, -2
+; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v9, 2
+; CHECK-NEXT: ret
+ %res = call <vscale x 1 x bfloat> @llvm.vector.splice.nxv1bf16(<vscale x 1 x bfloat> %a, <vscale x 1 x bfloat> %b, i32 -2)
+ ret <vscale x 1 x bfloat> %res
+}
+
+define <vscale x 1 x bfloat> @splice_nxv1bf16_offset_max(<vscale x 1 x bfloat> %a, <vscale x 1 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv1bf16_offset_max:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: srli a0, a0, 3
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 1
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v9, a0
+; CHECK-NEXT: ret
+ %res = call <vscale x 1 x bfloat> @llvm.vector.splice.nxv1bf16(<vscale x 1 x bfloat> %a, <vscale x 1 x bfloat> %b, i32 1)
+ ret <vscale x 1 x bfloat> %res
+}
+
+declare <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, i32)
+
+define <vscale x 2 x bfloat> @splice_nxv2bf16_offset_zero(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv2bf16_offset_zero:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b, i32 0)
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 2 x bfloat> @splice_nxv2bf16_offset_negone(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv2bf16_offset_negone:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: srli a0, a0, 2
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: vslideup.vi v8, v9, 1
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b, i32 -1)
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 2 x bfloat> @splice_nxv2bf16_offset_min(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv2bf16_offset_min:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: srli a0, a0, 2
+; CHECK-NEXT: addi a0, a0, -4
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v9, 4
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b, i32 -4)
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 2 x bfloat> @splice_nxv2bf16_offset_max(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv2bf16_offset_max:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: srli a0, a0, 2
+; CHECK-NEXT: addi a0, a0, -3
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 3
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v9, a0
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b, i32 3)
+ ret <vscale x 2 x bfloat> %res
+}
+
+declare <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, i32)
+
+define <vscale x 4 x bfloat> @splice_nxv4bf16_offset_zero(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv4bf16_offset_zero:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b, i32 0)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @splice_nxv4bf16_offset_negone(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv4bf16_offset_negone:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: srli a0, a0, 1
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: vslideup.vi v8, v9, 1
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b, i32 -1)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @splice_nxv4bf16_offset_min(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv4bf16_offset_min:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: srli a0, a0, 1
+; CHECK-NEXT: addi a0, a0, -8
+; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v9, 8
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b, i32 -8)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @splice_nxv4bf16_offset_max(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv4bf16_offset_max:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: srli a0, a0, 1
+; CHECK-NEXT: addi a0, a0, -7
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 7
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v9, a0
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b, i32 7)
+ ret <vscale x 4 x bfloat> %res
+}
+
+declare <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
+
+define <vscale x 8 x bfloat> @splice_nxv8bf16_offset_zero(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv8bf16_offset_zero:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, i32 0)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @splice_nxv8bf16_offset_negone(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv8bf16_offset_negone:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: vsetivli zero, 1, e16, m2, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v10, 1
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, i32 -1)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @splice_nxv8bf16_offset_min(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv8bf16_offset_min:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v10, 16
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, i32 -16)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @splice_nxv8bf16_offset_max(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv8bf16_offset_max:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: addi a0, a0, -15
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 15
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v10, a0
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, i32 15)
+ ret <vscale x 8 x bfloat> %res
+}
+
+declare <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat>, <vscale x 16 x bfloat>, i32)
+
+define <vscale x 16 x bfloat> @splice_nxv16bf16_offset_zero(<vscale x 16 x bfloat> %a, <vscale x 16 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv16bf16_offset_zero:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ret
+ %res = call <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat> %a, <vscale x 16 x bfloat> %b, i32 0)
+ ret <vscale x 16 x bfloat> %res
+}
+
+define <vscale x 16 x bfloat> @splice_nxv16bf16_offset_negone(<vscale x 16 x bfloat> %a, <vscale x 16 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv16bf16_offset_negone:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: vsetivli zero, 1, e16, m4, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v12, 1
+; CHECK-NEXT: ret
+ %res = call <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat> %a, <vscale x 16 x bfloat> %b, i32 -1)
+ ret <vscale x 16 x bfloat> %res
+}
+
+define <vscale x 16 x bfloat> @splice_nxv16bf16_offset_min(<vscale x 16 x bfloat> %a, <vscale x 16 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv16bf16_offset_min:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: addi a0, a0, -32
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v12, a1
+; CHECK-NEXT: ret
+ %res = call <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat> %a, <vscale x 16 x bfloat> %b, i32 -32)
+ ret <vscale x 16 x bfloat> %res
+}
+
+define <vscale x 16 x bfloat> @splice_nxv16bf16_offset_max(<vscale x 16 x bfloat> %a, <vscale x 16 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv16bf16_offset_max:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: addi a0, a0, -31
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 31
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v12, a0
+; CHECK-NEXT: ret
+ %res = call <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat> %a, <vscale x 16 x bfloat> %b, i32 31)
+ ret <vscale x 16 x bfloat> %res
+}
+
+declare <vscale x 32 x bfloat> @llvm.vector.splice.nxv32bf16(<vscale x 32 x bfloat>, <vscale x 32 x bfloat>, i32)
+
+define <vscale x 32 x bfloat> @splice_nxv32bf16_offset_zero(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv32bf16_offset_zero:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ret
+ %res = call <vscale x 32 x bfloat> @llvm.vector.splice.nxv32bf16(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b, i32 0)
+ ret <vscale x 32 x bfloat> %res
+}
+
+define <vscale x 32 x bfloat> @splice_nxv32bf16_offset_negone(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv32bf16_offset_negone:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: vsetivli zero, 1, e16, m8, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v16, 1
+; CHECK-NEXT: ret
+ %res = call <vscale x 32 x bfloat> @llvm.vector.splice.nxv32bf16(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b, i32 -1)
+ ret <vscale x 32 x bfloat> %res
+}
+
+define <vscale x 32 x bfloat> @splice_nxv32bf16_offset_min(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv32bf16_offset_min:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: addi a0, a0, -64
+; CHECK-NEXT: li a1, 64
+; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v16, a1
+; CHECK-NEXT: ret
+ %res = call <vscale x 32 x bfloat> @llvm.vector.splice.nxv32bf16(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b, i32 -64)
+ ret <vscale x 32 x bfloat> %res
+}
+
+define <vscale x 32 x bfloat> @splice_nxv32bf16_offset_max(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv32bf16_offset_max:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: addi a0, a0, -63
+; CHECK-NEXT: li a1, 63
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a1
+; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v16, a0
+; CHECK-NEXT: ret
+ %res = call <vscale x 32 x bfloat> @llvm.vector.splice.nxv32bf16(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b, i32 63)
+ ret <vscale x 32 x bfloat> %res
+}
+
declare <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, i32)
define <vscale x 1 x half> @splice_nxv1f16_offset_zero(<vscale x 1 x half> %a, <vscale x 1 x half> %b) #0 {
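The bf16 splice tests above mirror the existing f16 ones; only the new
+zvfbfmin/+zvfhmin RUN lines are needed because a splice is pure data movement
and never interprets the element values. For reference, llvm.vector.splice
concatenates its two operands and extracts a window: a non-negative offset
starts the window at that element of %a, while a negative offset -n takes the
trailing n elements of %a followed by leading elements of %b. A fixed-width
worked example (illustrative only):

; splice(<4 x i32> <1,2,3,4>, <4 x i32> <5,6,7,8>, i32 -1) --> <4,5,6,7>
; splice(<4 x i32> <1,2,3,4>, <4 x i32> <5,6,7,8>, i32  1) --> <2,3,4,5>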
diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll
index a360ae1..11f603b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll
@@ -1122,6 +1122,132 @@ define <vscale x 4 x i32> @vrem_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
ret <vscale x 4 x i32> %2
}
+define <vscale x 4 x i64> @vwmul_vv(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b, iXLen %vl) {
+; NOVLOPT-LABEL: vwmul_vv:
+; NOVLOPT: # %bb.0:
+; NOVLOPT-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; NOVLOPT-NEXT: vwmul.vv v12, v8, v9
+; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; NOVLOPT-NEXT: vwmul.vv v8, v12, v12
+; NOVLOPT-NEXT: ret
+;
+; VLOPT-LABEL: vwmul_vv:
+; VLOPT: # %bb.0:
+; VLOPT-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; VLOPT-NEXT: vwmul.vv v12, v8, v9
+; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; VLOPT-NEXT: vwmul.vv v8, v12, v12
+; VLOPT-NEXT: ret
+ %1 = call <vscale x 4 x i32> @llvm.riscv.vwmul.nxv4i64.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i16> %a, <vscale x 4 x i16> %b, iXLen -1)
+ %2 = call <vscale x 4 x i64> @llvm.riscv.vwmul.nxv4i64.nxv4i32.nxv4i32(<vscale x 4 x i64> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
+ ret <vscale x 4 x i64> %2
+}
+
+define <vscale x 4 x i64> @vwmul_vx(<vscale x 4 x i16> %a, i16 %b, i32 %c, iXLen %vl) {
+; NOVLOPT-LABEL: vwmul_vx:
+; NOVLOPT: # %bb.0:
+; NOVLOPT-NEXT: vsetvli a3, zero, e16, m1, ta, ma
+; NOVLOPT-NEXT: vwmul.vx v12, v8, a0
+; NOVLOPT-NEXT: vsetvli zero, a2, e32, m2, ta, ma
+; NOVLOPT-NEXT: vwmul.vx v8, v12, a1
+; NOVLOPT-NEXT: ret
+;
+; VLOPT-LABEL: vwmul_vx:
+; VLOPT: # %bb.0:
+; VLOPT-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; VLOPT-NEXT: vwmul.vx v12, v8, a0
+; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; VLOPT-NEXT: vwmul.vx v8, v12, a1
+; VLOPT-NEXT: ret
+ %1 = call <vscale x 4 x i32> @llvm.riscv.vwmul.nxv4i32.nxv4i16.i16(<vscale x 4 x i32> poison, <vscale x 4 x i16> %a, i16 %b, iXLen -1)
+ %2 = call <vscale x 4 x i64> @llvm.riscv.vwmul.nxv4i64.nxv4i64.i32(<vscale x 4 x i64> poison, <vscale x 4 x i32> %1, i32 %c, iXLen %vl)
+ ret <vscale x 4 x i64> %2
+}
+
+define <vscale x 4 x i64> @vwmulsu_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
+; NOVLOPT-LABEL: vwmulsu_vv:
+; NOVLOPT: # %bb.0:
+; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
+; NOVLOPT-NEXT: vwmulsu.vv v12, v8, v10
+; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; NOVLOPT-NEXT: vadd.vv v8, v12, v12
+; NOVLOPT-NEXT: ret
+;
+; VLOPT-LABEL: vwmulsu_vv:
+; VLOPT: # %bb.0:
+; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; VLOPT-NEXT: vwmulsu.vv v12, v8, v10
+; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; VLOPT-NEXT: vadd.vv v8, v12, v12
+; VLOPT-NEXT: ret
+ %1 = call <vscale x 4 x i64> @llvm.riscv.vwmulsu.nxv4i64.nxv4i32.nxv4i32(<vscale x 4 x i64> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
+ %2 = call <vscale x 4 x i64> @llvm.riscv.vadd.nxv4i64.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i64> %1, <vscale x 4 x i64> %1, iXLen %vl)
+ ret <vscale x 4 x i64> %2
+}
+
+define <vscale x 4 x i64> @vwmulsu_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
+; NOVLOPT-LABEL: vwmulsu_vx:
+; NOVLOPT: # %bb.0:
+; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
+; NOVLOPT-NEXT: vwmulsu.vx v12, v8, a0
+; NOVLOPT-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; NOVLOPT-NEXT: vadd.vv v8, v12, v12
+; NOVLOPT-NEXT: ret
+;
+; VLOPT-LABEL: vwmulsu_vx:
+; VLOPT: # %bb.0:
+; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; VLOPT-NEXT: vwmulsu.vx v12, v8, a0
+; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; VLOPT-NEXT: vadd.vv v8, v12, v12
+; VLOPT-NEXT: ret
+ %1 = call <vscale x 4 x i64> @llvm.riscv.vwmulsu.nxv4i64.nxv4i32.i32(<vscale x 4 x i64> poison, <vscale x 4 x i32> %a, i32 %b, iXLen -1)
+ %2 = call <vscale x 4 x i64> @llvm.riscv.vadd.nxv4i64.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i64> %1, <vscale x 4 x i64> %1, iXLen %vl)
+ ret <vscale x 4 x i64> %2
+}
+
+define <vscale x 4 x i64> @vwmulu_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
+; NOVLOPT-LABEL: vwmulu_vv:
+; NOVLOPT: # %bb.0:
+; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
+; NOVLOPT-NEXT: vwmulu.vv v12, v8, v10
+; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; NOVLOPT-NEXT: vadd.vv v8, v12, v12
+; NOVLOPT-NEXT: ret
+;
+; VLOPT-LABEL: vwmulu_vv:
+; VLOPT: # %bb.0:
+; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; VLOPT-NEXT: vwmulu.vv v12, v8, v10
+; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; VLOPT-NEXT: vadd.vv v8, v12, v12
+; VLOPT-NEXT: ret
+ %1 = call <vscale x 4 x i64> @llvm.riscv.vwmulu.nxv4i64.nxv4i32.nxv4i32(<vscale x 4 x i64> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
+ %2 = call <vscale x 4 x i64> @llvm.riscv.vadd.nxv4i64.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i64> %1, <vscale x 4 x i64> %1, iXLen %vl)
+ ret <vscale x 4 x i64> %2
+}
+
+define <vscale x 4 x i64> @vwmulu_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
+; NOVLOPT-LABEL: vwmulu_vx:
+; NOVLOPT: # %bb.0:
+; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
+; NOVLOPT-NEXT: vwmulu.vx v12, v8, a0
+; NOVLOPT-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; NOVLOPT-NEXT: vadd.vv v8, v12, v12
+; NOVLOPT-NEXT: ret
+;
+; VLOPT-LABEL: vwmulu_vx:
+; VLOPT: # %bb.0:
+; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; VLOPT-NEXT: vwmulu.vx v12, v8, a0
+; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; VLOPT-NEXT: vadd.vv v8, v12, v12
+; VLOPT-NEXT: ret
+ %1 = call <vscale x 4 x i64> @llvm.riscv.vwmulu.nxv4i64.nxv4i32.i32(<vscale x 4 x i64> poison, <vscale x 4 x i32> %a, i32 %b, iXLen -1)
+ %2 = call <vscale x 4 x i64> @llvm.riscv.vadd.nxv4i64.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i64> %1, <vscale x 4 x i64> %1, iXLen %vl)
+ ret <vscale x 4 x i64> %2
+}
+
define <vscale x 4 x i32> @vwmacc_vx(<vscale x 4 x i16> %a, i16 %b, iXLen %vl) {
; NOVLOPT-LABEL: vwmacc_vx:
; NOVLOPT: # %bb.0:
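Each added pair contrasts the baseline (NOVLOPT) with the RISC-V VL optimizer
(VLOPT) on the widening multiplies: the producer initially runs at VLMAX
(vsetvli aN, zero, ...), and the optimizer shrinks it to the consumer's VL, so
the second vsetvli degenerates into a SEW/LMUL-only change. The vwmul_vv
checks above reduce to this shape:

; NOVLOPT                                ; VLOPT
; vsetvli a1, zero, e16, m1, ta, ma      ; vsetvli zero, a0, e16, m1, ta, ma
; vwmul.vv v12, v8, v9                   ; vwmul.vv v12, v8, v9
; vsetvli zero, a0, e32, m2, ta, ma      ; vsetvli zero, zero, e32, m2, ta, ma
; vwmul.vv v8, v12, v12                  ; vwmul.vv v8, v12, v12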
diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt.ll
index 0b3e67e..1a1472f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vl-opt.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt.ll
@@ -11,19 +11,46 @@
declare <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, iXLen)
define <vscale x 4 x i32> @different_imm_vl_with_ta(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl1, iXLen %vl2) {
-; CHECK-LABEL: different_imm_vl_with_ta:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma
-; CHECK-NEXT: vadd.vv v8, v10, v12
-; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT: vadd.vv v8, v8, v10
-; CHECK-NEXT: ret
+; NOVLOPT-LABEL: different_imm_vl_with_ta:
+; NOVLOPT: # %bb.0:
+; NOVLOPT-NEXT: vsetivli zero, 5, e32, m2, ta, ma
+; NOVLOPT-NEXT: vadd.vv v8, v10, v12
+; NOVLOPT-NEXT: vsetivli zero, 4, e32, m2, ta, ma
+; NOVLOPT-NEXT: vadd.vv v8, v8, v10
+; NOVLOPT-NEXT: ret
+;
+; VLOPT-LABEL: different_imm_vl_with_ta:
+; VLOPT: # %bb.0:
+; VLOPT-NEXT: vsetivli zero, 4, e32, m2, ta, ma
+; VLOPT-NEXT: vadd.vv v8, v10, v12
+; VLOPT-NEXT: vadd.vv v8, v8, v10
+; VLOPT-NEXT: ret
%v = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen 5)
%w = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %v, <vscale x 4 x i32> %a, iXLen 4)
ret <vscale x 4 x i32> %w
}
-; No benificial to propagate VL since VL is larger in the use side.
+define <vscale x 4 x i32> @vlmax_and_imm_vl_with_ta(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl1, iXLen %vl2) {
+; NOVLOPT-LABEL: vlmax_and_imm_vl_with_ta:
+; NOVLOPT: # %bb.0:
+; NOVLOPT-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; NOVLOPT-NEXT: vadd.vv v8, v10, v12
+; NOVLOPT-NEXT: vsetivli zero, 4, e32, m2, ta, ma
+; NOVLOPT-NEXT: vadd.vv v8, v8, v10
+; NOVLOPT-NEXT: ret
+;
+; VLOPT-LABEL: vlmax_and_imm_vl_with_ta:
+; VLOPT: # %bb.0:
+; VLOPT-NEXT: vsetivli zero, 4, e32, m2, ta, ma
+; VLOPT-NEXT: vadd.vv v8, v10, v12
+; VLOPT-NEXT: vadd.vv v8, v8, v10
+; VLOPT-NEXT: ret
+ %v = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
+ %w = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %v, <vscale x 4 x i32> %a, iXLen 4)
+ ret <vscale x 4 x i32> %w
+}
+
+; Not beneficial to propagate VL since VL is larger on the use side.
define <vscale x 4 x i32> @different_imm_vl_with_ta_larger_vl(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl1, iXLen %vl2) {
; CHECK-LABEL: different_imm_vl_with_ta_larger_vl:
; CHECK: # %bb.0:
@@ -50,8 +77,7 @@ define <vscale x 4 x i32> @different_imm_reg_vl_with_ta(<vscale x 4 x i32> %pass
ret <vscale x 4 x i32> %w
}
-
-; No benificial to propagate VL since VL is already one.
+; Not beneficial to propagate VL since VL is already one.
define <vscale x 4 x i32> @different_imm_vl_with_ta_1(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl1, iXLen %vl2) {
; CHECK-LABEL: different_imm_vl_with_ta_1:
; CHECK: # %bb.0:
@@ -110,7 +136,3 @@ define <vscale x 4 x i32> @different_imm_vl_with_tu(<vscale x 4 x i32> %passthru
%w = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %v, <vscale x 4 x i32> %a,iXLen 4)
ret <vscale x 4 x i32> %w
}
-
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; NOVLOPT: {{.*}}
-; VLOPT: {{.*}}
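The new vlmax_and_imm_vl_with_ta case confirms that a producer running at
VLMAX (iXLen -1) can also be shrunk to a consumer's small immediate VL, while
the two "Not beneficial" cases mark the boundary: VL is only propagated
backwards when the user reads no more elements than the producer would then
define, so a larger use-side VL (or one that is already minimal) is left
alone. The IR shape involved, with the relevant VLs annotated:

%v = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1) ; producer at VLMAX
%w = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %v, <vscale x 4 x i32> %a, iXLen 4)  ; consumer needs 4
; => the producer's VL can be reduced from VLMAX to 4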
diff --git a/llvm/test/CodeGen/Thumb2/avoidmuls.mir b/llvm/test/CodeGen/Thumb2/avoidmuls.mir
new file mode 100644
index 0000000..8d55674
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/avoidmuls.mir
@@ -0,0 +1,67 @@
+# RUN: llc -run-pass=thumb2-reduce-size %s -o - | FileCheck %s
+
+--- |
+ target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+ target triple = "thumbv8m.main-arm-none-eabi"
+
+ ; Function Attrs: norecurse nounwind readnone
+ define i32 @test(i32 %x, i32 %y) local_unnamed_addr #0 {
+ entry:
+ %cmp6 = icmp sgt i32 %y, 0
+ br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+ for.body.preheader: ; preds = %entry
+ br label %for.body
+
+ for.cond.cleanup: ; preds = %for.body, %entry
+ %sum.0.lcssa = phi i32 [ 1, %entry ], [ %mul, %for.body ]
+ ret i32 %sum.0.lcssa
+
+ for.body: ; preds = %for.body, %for.body.preheader
+ %lsr.iv1 = phi i32 [ %lsr.iv.next2, %for.body ], [ %x, %for.body.preheader ]
+ %lsr.iv = phi i32 [ %lsr.iv.next, %for.body ], [ %y, %for.body.preheader ]
+ %sum.07 = phi i32 [ %mul, %for.body ], [ 1, %for.body.preheader ]
+ %mul = mul nsw i32 %lsr.iv1, %sum.07
+ %lsr.iv.next = add i32 %lsr.iv, -1
+ %lsr.iv.next2 = add i32 %lsr.iv1, 1
+ %exitcond = icmp eq i32 %lsr.iv.next, 0
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+ }
+
+ attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m33" "target-features"="-d32,+dsp,+fp-armv8,-fp64,+hwdiv,+strict-align,+thumb-mode,-crc,-dotprod,-hwdiv-arm,-ras" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+...
+---
+name: test
+tracksRegLiveness: true
+liveins:
+ - { reg: '$r0', virtual-reg: '' }
+ - { reg: '$r1', virtual-reg: '' }
+body: |
+ bb.0.entry:
+ successors: %bb.1.for.body, %bb.2.for.cond.cleanup
+ liveins: $r0, $r1
+
+ $r2 = tMOVr $r0, 14, _
+ $r0 = t2MOVi 1, 14, _, _
+ t2CMPri $r1, 1, 14, _, implicit-def $cpsr
+ t2Bcc %bb.2.for.cond.cleanup, 11, killed $cpsr
+
+ bb.1.for.body:
+ successors: %bb.2.for.cond.cleanup, %bb.1.for.body
+ liveins: $r0, $r1, $r2
+
+ $r0 = t2MUL $r2, killed $r0, 14, _
+ $r2 = t2ADDri killed $r2, 1, 14, _, _
+ $r1 = t2SUBri killed $r1, 1, 14, _, def $cpsr
+ t2Bcc %bb.1.for.body, 1, killed $cpsr
+
+ bb.2.for.cond.cleanup:
+ liveins: $r0
+
+ tBX_RET 14, _, implicit $r0
+
+...
+# CHECK-LABEL: test
+# CHECK: tMUL
+# CHECK-NOT: t2MUL
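This new MIR test pins the thumb2-reduce-size rewrite of multiplies: the
32-bit t2MUL is narrowed to the 16-bit tMUL encoding, which is destructive
(the destination must reuse a source register), so the narrowing is only
possible here because the loop's multiply already overwrites one of its own
operands. The rewrite the CHECK lines require, in sketch form (the exact
narrowed operand order is illustrative):

# before (Thumb2, 32-bit encoding): $r0 = t2MUL $r2, killed $r0, 14, _
# after  (Thumb1, 16-bit encoding): $r0 = tMUL killed $r0, $r2, ...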
diff --git a/llvm/test/CodeGen/X86/andnot-patterns.ll b/llvm/test/CodeGen/X86/andnot-patterns.ll
new file mode 100644
index 0000000..46ebe6b
--- /dev/null
+++ b/llvm/test/CodeGen/X86/andnot-patterns.ll
@@ -0,0 +1,626 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-- -mattr=+bmi | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+bmi | FileCheck %s --check-prefixes=X64
+
+; TODO - PR112425 - attempt to reconstruct andnot patterns through bitwise-agnostic operations
+
+declare void @use_i64(i64)
+
+;
+; Fold (and X, (rotl (not Y), Z)) -> (and X, (not (rotl Y, Z)))
+;
+
+define i64 @andnot_rotl_i64(i64 %a0, i64 %a1, i64 %a2) nounwind {
+; X86-LABEL: andnot_rotl_i64:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: notl %esi
+; X86-NEXT: notl %edx
+; X86-NEXT: testb $32, %cl
+; X86-NEXT: jne .LBB0_1
+; X86-NEXT: # %bb.2:
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: jmp .LBB0_3
+; X86-NEXT: .LBB0_1:
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: .LBB0_3:
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: andl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: andnot_rotl_i64:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: notq %rax
+; X64-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-NEXT: rolq %cl, %rax
+; X64-NEXT: andq %rdi, %rax
+; X64-NEXT: retq
+ %not = xor i64 %a1, -1
+ %rot = tail call i64 @llvm.fshl.i64(i64 %not, i64 %not, i64 %a2)
+ %and = and i64 %rot, %a0
+ ret i64 %and
+}
+
+define i32 @andnot_rotl_i32(i32 %a0, i32 %a1, i32 %a2) nounwind {
+; X86-LABEL: andnot_rotl_i32:
+; X86: # %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: roll %cl, %eax
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: andnot_rotl_i32:
+; X64: # %bb.0:
+; X64-NEXT: movl %edx, %ecx
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: notl %eax
+; X64-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NEXT: roll %cl, %eax
+; X64-NEXT: andl %edi, %eax
+; X64-NEXT: retq
+ %not = xor i32 %a1, -1
+ %rot = tail call i32 @llvm.fshl.i32(i32 %not, i32 %not, i32 %a2)
+ %and = and i32 %rot, %a0
+ ret i32 %and
+}
+
+define i16 @andnot_rotl_i16(i16 %a0, i16 %a1, i16 %a2) nounwind {
+; X86-LABEL: andnot_rotl_i16:
+; X86: # %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: rolw %cl, %ax
+; X86-NEXT: andw {{[0-9]+}}(%esp), %ax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: retl
+;
+; X64-LABEL: andnot_rotl_i16:
+; X64: # %bb.0:
+; X64-NEXT: movl %edx, %ecx
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: notl %eax
+; X64-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NEXT: rolw %cl, %ax
+; X64-NEXT: andl %edi, %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+ %not = xor i16 %a1, -1
+ %rot = tail call i16 @llvm.fshl.i16(i16 %not, i16 %not, i16 %a2)
+ %and = and i16 %rot, %a0
+ ret i16 %and
+}
+
+define i8 @andnot_rotl_i8(i8 %a0, i8 %a1, i8 %a2) nounwind {
+; X86-LABEL: andnot_rotl_i8:
+; X86: # %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: notb %al
+; X86-NEXT: rolb %cl, %al
+; X86-NEXT: andb {{[0-9]+}}(%esp), %al
+; X86-NEXT: retl
+;
+; X64-LABEL: andnot_rotl_i8:
+; X64: # %bb.0:
+; X64-NEXT: movl %edx, %ecx
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: notb %al
+; X64-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NEXT: rolb %cl, %al
+; X64-NEXT: andb %dil, %al
+; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: retq
+ %not = xor i8 %a1, -1
+ %rot = tail call i8 @llvm.fshl.i8(i8 %not, i8 %not, i8 %a2)
+ %and = and i8 %rot, %a0
+ ret i8 %and
+}
+
+define i64 @andnot_rotl_i64_multiuse(i64 %a0, i64 %a1, i64 %a2) nounwind {
+; X86-LABEL: andnot_rotl_i64_multiuse:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: notl %edx
+; X86-NEXT: notl %esi
+; X86-NEXT: testb $32, %cl
+; X86-NEXT: jne .LBB4_1
+; X86-NEXT: # %bb.2:
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: jmp .LBB4_3
+; X86-NEXT: .LBB4_1:
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: .LBB4_3:
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: andl %eax, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: andl %ebx, %edi
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll use_i64@PLT
+; X86-NEXT: addl $8, %esp
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: retl
+;
+; X64-LABEL: andnot_rotl_i64_multiuse:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rbx
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rdi, %rbx
+; X64-NEXT: notq %rsi
+; X64-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-NEXT: rolq %cl, %rsi
+; X64-NEXT: andq %rsi, %rbx
+; X64-NEXT: movq %rsi, %rdi
+; X64-NEXT: callq use_i64@PLT
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: popq %rbx
+; X64-NEXT: retq
+ %not = xor i64 %a1, -1
+ %rot = tail call i64 @llvm.fshl.i64(i64 %not, i64 %not, i64 %a2)
+ %and = and i64 %rot, %a0
+ call void @use_i64(i64 %rot)
+ ret i64 %and
+}
+
+;
+; Fold (and X, (rotr (not Y), Z)) -> (and X, (not (rotr Y, Z)))
+;
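+; Rotation only permutes bits, so NOT commutes through it; a quick i8 check
+; with Y = 0x0F, Z = 4:
+;   rotr(not 0x0F, 4) = rotr(0xF0, 4) = 0x0F
+;   not(rotr(0x0F, 4)) = not(0xF0)    = 0x0F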
+
+define i64 @andnot_rotr_i64(i64 %a0, i64 %a1, i64 %a2) nounwind {
+; X86-LABEL: andnot_rotr_i64:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: notl %esi
+; X86-NEXT: notl %edx
+; X86-NEXT: testb $32, %cl
+; X86-NEXT: je .LBB5_1
+; X86-NEXT: # %bb.2:
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: jmp .LBB5_3
+; X86-NEXT: .LBB5_1:
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: .LBB5_3:
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: shrdl %cl, %eax, %edx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shrdl %cl, %esi, %eax
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: andl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: andnot_rotr_i64:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: notq %rax
+; X64-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-NEXT: rorq %cl, %rax
+; X64-NEXT: andq %rdi, %rax
+; X64-NEXT: retq
+ %not = xor i64 %a1, -1
+ %rot = tail call i64 @llvm.fshr.i64(i64 %not, i64 %not, i64 %a2)
+ %and = and i64 %rot, %a0
+ ret i64 %and
+}
+
+define i32 @andnot_rotr_i32(i32 %a0, i32 %a1, i32 %a2) nounwind {
+; X86-LABEL: andnot_rotr_i32:
+; X86: # %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: rorl %cl, %eax
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: andnot_rotr_i32:
+; X64: # %bb.0:
+; X64-NEXT: movl %edx, %ecx
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: notl %eax
+; X64-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NEXT: rorl %cl, %eax
+; X64-NEXT: andl %edi, %eax
+; X64-NEXT: retq
+ %not = xor i32 %a1, -1
+ %rot = tail call i32 @llvm.fshr.i32(i32 %not, i32 %not, i32 %a2)
+ %and = and i32 %rot, %a0
+ ret i32 %and
+}
+
+define i16 @andnot_rotr_i16(i16 %a0, i16 %a1, i16 %a2) nounwind {
+; X86-LABEL: andnot_rotr_i16:
+; X86: # %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: rorw %cl, %ax
+; X86-NEXT: andw {{[0-9]+}}(%esp), %ax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: retl
+;
+; X64-LABEL: andnot_rotr_i16:
+; X64: # %bb.0:
+; X64-NEXT: movl %edx, %ecx
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: notl %eax
+; X64-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NEXT: rorw %cl, %ax
+; X64-NEXT: andl %edi, %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+ %not = xor i16 %a1, -1
+ %rot = tail call i16 @llvm.fshr.i16(i16 %not, i16 %not, i16 %a2)
+ %and = and i16 %rot, %a0
+ ret i16 %and
+}
+
+define i8 @andnot_rotr_i8(i8 %a0, i8 %a1, i8 %a2) nounwind {
+; X86-LABEL: andnot_rotr_i8:
+; X86: # %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: notb %al
+; X86-NEXT: rorb %cl, %al
+; X86-NEXT: andb {{[0-9]+}}(%esp), %al
+; X86-NEXT: retl
+;
+; X64-LABEL: andnot_rotr_i8:
+; X64: # %bb.0:
+; X64-NEXT: movl %edx, %ecx
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: notb %al
+; X64-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NEXT: rorb %cl, %al
+; X64-NEXT: andb %dil, %al
+; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: retq
+ %not = xor i8 %a1, -1
+ %rot = tail call i8 @llvm.fshr.i8(i8 %not, i8 %not, i8 %a2)
+ %and = and i8 %rot, %a0
+ ret i8 %and
+}
+
+;
+; Fold (and X, (bswap (not Y))) -> (and X, (not (bswap Y)))
+;
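+; bswap permutes whole bytes, so it commutes with NOT as well; for i16 with
+; Y = 0x00FF:
+;   bswap(not 0x00FF) = bswap(0xFF00) = 0x00FF
+;   not(bswap(0x00FF)) = not(0xFF00)  = 0x00FF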
+
+define i64 @andnot_bswap_i64(i64 %a0, i64 %a1) nounwind {
+; X86-LABEL: andnot_bswap_i64:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: notl %edx
+; X86-NEXT: bswapl %edx
+; X86-NEXT: bswapl %eax
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: andl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: retl
+;
+; X64-LABEL: andnot_bswap_i64:
+; X64: # %bb.0:
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: notq %rax
+; X64-NEXT: bswapq %rax
+; X64-NEXT: andq %rdi, %rax
+; X64-NEXT: retq
+ %not = xor i64 %a1, -1
+ %bswap = tail call i64 @llvm.bswap.i64(i64 %not)
+ %and = and i64 %bswap, %a0
+ ret i64 %and
+}
+
+define i32 @andnot_bswap_i32(i32 %a0, i32 %a1) nounwind {
+; X86-LABEL: andnot_bswap_i32:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: bswapl %eax
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: andnot_bswap_i32:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: notl %eax
+; X64-NEXT: bswapl %eax
+; X64-NEXT: andl %edi, %eax
+; X64-NEXT: retq
+ %not = xor i32 %a1, -1
+ %bswap = tail call i32 @llvm.bswap.i32(i32 %not)
+ %and = and i32 %bswap, %a0
+ ret i32 %and
+}
+
+define i16 @andnot_bswap_i16(i16 %a0, i16 %a1) nounwind {
+; X86-LABEL: andnot_bswap_i16:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: rolw $8, %ax
+; X86-NEXT: andw {{[0-9]+}}(%esp), %ax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: retl
+;
+; X64-LABEL: andnot_bswap_i16:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: notl %eax
+; X64-NEXT: rolw $8, %ax
+; X64-NEXT: andl %edi, %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+ %not = xor i16 %a1, -1
+ %bswap = tail call i16 @llvm.bswap.i16(i16 %not)
+ %and = and i16 %bswap, %a0
+ ret i16 %and
+}
+
+;
+; Fold (and X, (bitreverse (not Y))) -> (and X, (not (bitreverse Y)))
+;
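+; bitreverse is a pure bit permutation too, so the same reasoning applies;
+; for i8 with Y = 0x01:
+;   bitreverse(not 0x01) = bitreverse(0xFE) = 0x7F
+;   not(bitreverse(0x01)) = not(0x80)       = 0x7F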
+
+define i64 @andnot_bitreverse_i64(i64 %a0, i64 %a1) nounwind {
+; X86-LABEL: andnot_bitreverse_i64:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: notl %ecx
+; X86-NEXT: bswapl %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT: shll $4, %edx
+; X86-NEXT: shrl $4, %ecx
+; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT: shrl $2, %ecx
+; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT: leal (%ecx,%edx,4), %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT: shrl %ecx
+; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555
+; X86-NEXT: leal (%ecx,%edx,2), %edx
+; X86-NEXT: bswapl %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT: shll $4, %ecx
+; X86-NEXT: shrl $4, %eax
+; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT: shrl $2, %eax
+; X86-NEXT: andl $858993459, %eax # imm = 0x33333333
+; X86-NEXT: leal (%eax,%ecx,4), %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555
+; X86-NEXT: shrl %eax
+; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555
+; X86-NEXT: leal (%eax,%ecx,2), %eax
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: andl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: retl
+;
+; X64-LABEL: andnot_bitreverse_i64:
+; X64: # %bb.0:
+; X64-NEXT: notq %rsi
+; X64-NEXT: bswapq %rsi
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: shrq $4, %rax
+; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
+; X64-NEXT: andq %rcx, %rax
+; X64-NEXT: andq %rcx, %rsi
+; X64-NEXT: shlq $4, %rsi
+; X64-NEXT: orq %rax, %rsi
+; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; X64-NEXT: movq %rsi, %rcx
+; X64-NEXT: andq %rax, %rcx
+; X64-NEXT: shrq $2, %rsi
+; X64-NEXT: andq %rax, %rsi
+; X64-NEXT: leaq (%rsi,%rcx,4), %rax
+; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; X64-NEXT: movq %rax, %rdx
+; X64-NEXT: andq %rcx, %rdx
+; X64-NEXT: shrq %rax
+; X64-NEXT: andq %rcx, %rax
+; X64-NEXT: leaq (%rax,%rdx,2), %rax
+; X64-NEXT: andq %rdi, %rax
+; X64-NEXT: retq
+ %not = xor i64 %a1, -1
+ %bitrev = tail call i64 @llvm.bitreverse.i64(i64 %not)
+ %and = and i64 %bitrev, %a0
+ ret i64 %and
+}
+
+define i32 @andnot_bitreverse_i32(i32 %a0, i32 %a1) nounwind {
+; X86-LABEL: andnot_bitreverse_i32:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: bswapl %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT: shll $4, %ecx
+; X86-NEXT: shrl $4, %eax
+; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT: shrl $2, %eax
+; X86-NEXT: andl $858993459, %eax # imm = 0x33333333
+; X86-NEXT: leal (%eax,%ecx,4), %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555
+; X86-NEXT: shrl %eax
+; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555
+; X86-NEXT: leal (%eax,%ecx,2), %eax
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: andnot_bitreverse_i32:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: notl %esi
+; X64-NEXT: bswapl %esi
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
+; X64-NEXT: shll $4, %eax
+; X64-NEXT: shrl $4, %esi
+; X64-NEXT: andl $252645135, %esi # imm = 0xF0F0F0F
+; X64-NEXT: orl %eax, %esi
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: andl $858993459, %eax # imm = 0x33333333
+; X64-NEXT: shrl $2, %esi
+; X64-NEXT: andl $858993459, %esi # imm = 0x33333333
+; X64-NEXT: leal (%rsi,%rax,4), %eax
+; X64-NEXT: movl %eax, %ecx
+; X64-NEXT: andl $1431655765, %ecx # imm = 0x55555555
+; X64-NEXT: shrl %eax
+; X64-NEXT: andl $1431655765, %eax # imm = 0x55555555
+; X64-NEXT: leal (%rax,%rcx,2), %eax
+; X64-NEXT: andl %edi, %eax
+; X64-NEXT: retq
+ %not = xor i32 %a1, -1
+ %bitrev = tail call i32 @llvm.bitreverse.i32(i32 %not)
+ %and = and i32 %bitrev, %a0
+ ret i32 %and
+}
+
+define i16 @andnot_bitreverse_i16(i16 %a0, i16 %a1) nounwind {
+; X86-LABEL: andnot_bitreverse_i16:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: rolw $8, %ax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: andl $3855, %ecx # imm = 0xF0F
+; X86-NEXT: shll $4, %ecx
+; X86-NEXT: shrl $4, %eax
+; X86-NEXT: andl $3855, %eax # imm = 0xF0F
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: andl $13107, %ecx # imm = 0x3333
+; X86-NEXT: shrl $2, %eax
+; X86-NEXT: andl $13107, %eax # imm = 0x3333
+; X86-NEXT: leal (%eax,%ecx,4), %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: andl $21845, %ecx # imm = 0x5555
+; X86-NEXT: shrl %eax
+; X86-NEXT: andl $21845, %eax # imm = 0x5555
+; X86-NEXT: leal (%eax,%ecx,2), %eax
+; X86-NEXT: andw {{[0-9]+}}(%esp), %ax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: retl
+;
+; X64-LABEL: andnot_bitreverse_i16:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: notl %esi
+; X64-NEXT: rolw $8, %si
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: andl $3855, %eax # imm = 0xF0F
+; X64-NEXT: shll $4, %eax
+; X64-NEXT: shrl $4, %esi
+; X64-NEXT: andl $3855, %esi # imm = 0xF0F
+; X64-NEXT: orl %eax, %esi
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: andl $13107, %eax # imm = 0x3333
+; X64-NEXT: shrl $2, %esi
+; X64-NEXT: andl $13107, %esi # imm = 0x3333
+; X64-NEXT: leal (%rsi,%rax,4), %eax
+; X64-NEXT: movl %eax, %ecx
+; X64-NEXT: andl $21845, %ecx # imm = 0x5555
+; X64-NEXT: shrl %eax
+; X64-NEXT: andl $21845, %eax # imm = 0x5555
+; X64-NEXT: leal (%rax,%rcx,2), %eax
+; X64-NEXT: andl %edi, %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+ %not = xor i16 %a1, -1
+ %bitrev = tail call i16 @llvm.bitreverse.i16(i16 %not)
+ %and = and i16 %bitrev, %a0
+ ret i16 %and
+}
+
+define i8 @andnot_bitreverse_i8(i8 %a0, i8 %a1) nounwind {
+; X86-LABEL: andnot_bitreverse_i8:
+; X86: # %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: notb %al
+; X86-NEXT: rolb $4, %al
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: andb $51, %cl
+; X86-NEXT: shlb $2, %cl
+; X86-NEXT: shrb $2, %al
+; X86-NEXT: andb $51, %al
+; X86-NEXT: orb %cl, %al
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: andb $85, %cl
+; X86-NEXT: addb %cl, %cl
+; X86-NEXT: shrb %al
+; X86-NEXT: andb $85, %al
+; X86-NEXT: orb %cl, %al
+; X86-NEXT: andb {{[0-9]+}}(%esp), %al
+; X86-NEXT: retl
+;
+; X64-LABEL: andnot_bitreverse_i8:
+; X64: # %bb.0:
+; X64-NEXT: notb %sil
+; X64-NEXT: rolb $4, %sil
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: andb $51, %al
+; X64-NEXT: shlb $2, %al
+; X64-NEXT: shrb $2, %sil
+; X64-NEXT: andb $51, %sil
+; X64-NEXT: orb %sil, %al
+; X64-NEXT: movl %eax, %ecx
+; X64-NEXT: andb $85, %cl
+; X64-NEXT: addb %cl, %cl
+; X64-NEXT: shrb %al
+; X64-NEXT: andb $85, %al
+; X64-NEXT: orb %cl, %al
+; X64-NEXT: andb %dil, %al
+; X64-NEXT: retq
+ %not = xor i8 %a1, -1
+ %bitrev = tail call i8 @llvm.bitreverse.i8(i8 %not)
+ %and = and i8 %bitrev, %a0
+ ret i8 %and
+}
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
index 8e42466..2b392e6 100644
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -974,7 +974,7 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) {
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpternlogq $216, %zmm2, %zmm1, %zmm0
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm2 & (zmm0 ^ zmm1))
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
diff --git a/llvm/test/CodeGen/X86/fp-intrinsics.ll b/llvm/test/CodeGen/X86/fp-intrinsics.ll
index bb87252..3577f25 100644
--- a/llvm/test/CodeGen/X86/fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/fp-intrinsics.ll
@@ -2962,6 +2962,64 @@ entry:
ret double %result
}
+; Verify that atan2(42.1, 3.0) isn't simplified when the rounding mode is unknown.
+define double @fatan2() #0 {
+; X87-LABEL: fatan2:
+; X87: # %bb.0: # %entry
+; X87-NEXT: subl $28, %esp
+; X87-NEXT: .cfi_def_cfa_offset 32
+; X87-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}}
+; X87-NEXT: fstpl {{[0-9]+}}(%esp)
+; X87-NEXT: fldl {{\.?LCPI[0-9]+_[0-9]+}}
+; X87-NEXT: fstpl (%esp)
+; X87-NEXT: wait
+; X87-NEXT: calll atan2
+; X87-NEXT: addl $28, %esp
+; X87-NEXT: .cfi_def_cfa_offset 4
+; X87-NEXT: retl
+;
+; X86-SSE-LABEL: fatan2:
+; X86-SSE: # %bb.0: # %entry
+; X86-SSE-NEXT: subl $28, %esp
+; X86-SSE-NEXT: .cfi_def_cfa_offset 32
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = [3.0E+0,0.0E+0]
+; X86-SSE-NEXT: movsd %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
+; X86-SSE-NEXT: movsd %xmm0, (%esp)
+; X86-SSE-NEXT: calll atan2
+; X86-SSE-NEXT: addl $28, %esp
+; X86-SSE-NEXT: .cfi_def_cfa_offset 4
+; X86-SSE-NEXT: retl
+;
+; SSE-LABEL: fatan2:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: pushq %rax
+; SSE-NEXT: .cfi_def_cfa_offset 16
+; SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
+; SSE-NEXT: movsd {{.*#+}} xmm1 = [3.0E+0,0.0E+0]
+; SSE-NEXT: callq atan2@PLT
+; SSE-NEXT: popq %rax
+; SSE-NEXT: .cfi_def_cfa_offset 8
+; SSE-NEXT: retq
+;
+; AVX-LABEL: fatan2:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: pushq %rax
+; AVX-NEXT: .cfi_def_cfa_offset 16
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [3.0E+0,0.0E+0]
+; AVX-NEXT: callq atan2@PLT
+; AVX-NEXT: popq %rax
+; AVX-NEXT: .cfi_def_cfa_offset 8
+; AVX-NEXT: retq
+entry:
+ %result = call double @llvm.experimental.constrained.atan2.f64(double 42.1,
+ double 3.0,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret double %result
+}
+
; Verify that cosh(42.0) isn't simplified when the rounding mode is unknown.
define double @fcosh() #0 {
; X87-LABEL: fcosh:
@@ -3132,6 +3190,7 @@ declare double @llvm.experimental.constrained.tan.f64(double, metadata, metadata
declare double @llvm.experimental.constrained.asin.f64(double, metadata, metadata)
declare double @llvm.experimental.constrained.acos.f64(double, metadata, metadata)
declare double @llvm.experimental.constrained.atan.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.atan2.f64(double, double, metadata, metadata)
declare double @llvm.experimental.constrained.sinh.f64(double, metadata, metadata)
declare double @llvm.experimental.constrained.cosh.f64(double, metadata, metadata)
declare double @llvm.experimental.constrained.tanh.f64(double, metadata, metadata)
diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
index 84574e3..ffaa9f6 100644
--- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
+++ b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
@@ -1247,6 +1247,50 @@ entry:
ret fp128 %atan
}
+define fp128 @atan2(fp128 %x, fp128 %y) nounwind strictfp {
+; ANDROID-LABEL: atan2:
+; ANDROID: # %bb.0: # %entry
+; ANDROID-NEXT: pushq %rax
+; ANDROID-NEXT: callq atan2l@PLT
+; ANDROID-NEXT: popq %rax
+; ANDROID-NEXT: retq
+;
+; GNU-LABEL: atan2:
+; GNU: # %bb.0: # %entry
+; GNU-NEXT: pushq %rax
+; GNU-NEXT: callq atan2f128@PLT
+; GNU-NEXT: popq %rax
+; GNU-NEXT: retq
+;
+; X86-LABEL: atan2:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $24, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll atan2l
+; X86-NEXT: addl $44, %esp
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, (%esi)
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl $24, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: retl $4
+entry:
+ %atan2 = call fp128 @llvm.experimental.constrained.atan2.f128(fp128 %x, fp128 %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+ ret fp128 %atan2
+}
+
define fp128 @tan(fp128 %x) nounwind strictfp {
; ANDROID-LABEL: tan:
; ANDROID: # %bb.0: # %entry
@@ -1948,6 +1992,7 @@ declare fp128 @llvm.experimental.constrained.sin.f128(fp128, metadata, metadata)
declare fp128 @llvm.experimental.constrained.sinh.f128(fp128, metadata, metadata)
declare fp128 @llvm.experimental.constrained.sqrt.f128(fp128, metadata, metadata)
declare fp128 @llvm.experimental.constrained.atan.f128(fp128, metadata, metadata)
+declare fp128 @llvm.experimental.constrained.atan2.f128(fp128, fp128, metadata, metadata)
declare fp128 @llvm.experimental.constrained.tan.f128(fp128, metadata, metadata)
declare fp128 @llvm.experimental.constrained.tanh.f128(fp128, metadata, metadata)
declare fp128 @llvm.experimental.constrained.trunc.f128(fp128, metadata)
diff --git a/llvm/test/CodeGen/X86/fp80-strict-libcalls.ll b/llvm/test/CodeGen/X86/fp80-strict-libcalls.ll
index 293133b0..8bbc624 100644
--- a/llvm/test/CodeGen/X86/fp80-strict-libcalls.ll
+++ b/llvm/test/CodeGen/X86/fp80-strict-libcalls.ll
@@ -629,6 +629,35 @@ entry:
ret x86_fp80 %atan
}
+define x86_fp80 @atan2(x86_fp80 %x, x86_fp80 %y) nounwind strictfp {
+; X86-LABEL: atan2:
+; X86: # %bb.0: # %entry
+; X86-NEXT: subl $24, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: wait
+; X86-NEXT: calll atan2l
+; X86-NEXT: addl $24, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: atan2:
+; X64: # %bb.0: # %entry
+; X64-NEXT: subq $40, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: wait
+; X64-NEXT: callq atan2l@PLT
+; X64-NEXT: addq $40, %rsp
+; X64-NEXT: retq
+entry:
+ %atan2 = call x86_fp80 @llvm.experimental.constrained.atan2.f80(x86_fp80 %x, x86_fp80 %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+ ret x86_fp80 %atan2
+}
+
define x86_fp80 @tan(x86_fp80 %x) nounwind strictfp {
; X86-LABEL: tan:
; X86: # %bb.0: # %entry
@@ -830,6 +859,7 @@ declare x86_fp80 @llvm.experimental.constrained.asin.f80(x86_fp80, metadata, met
declare x86_fp80 @llvm.experimental.constrained.sin.f80(x86_fp80, metadata, metadata)
declare x86_fp80 @llvm.experimental.constrained.sinh.f80(x86_fp80, metadata, metadata)
declare x86_fp80 @llvm.experimental.constrained.atan.f80(x86_fp80, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.atan2.f80(x86_fp80, x86_fp80, metadata, metadata)
declare x86_fp80 @llvm.experimental.constrained.tan.f80(x86_fp80, metadata, metadata)
declare x86_fp80 @llvm.experimental.constrained.tanh.f80(x86_fp80, metadata, metadata)
declare x86_fp80 @llvm.experimental.constrained.trunc.f80(x86_fp80, metadata)
diff --git a/llvm/test/CodeGen/X86/llvm.atan2.ll b/llvm/test/CodeGen/X86/llvm.atan2.ll
new file mode 100644
index 0000000..ef2e4be
--- /dev/null
+++ b/llvm/test/CodeGen/X86/llvm.atan2.ll
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
+define half @use_atan2f16(half %a, half %b) nounwind {
+; CHECK-LABEL: use_atan2f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: callq __extendhfsf2@PLT
+; CHECK-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: callq __extendhfsf2@PLT
+; CHECK-NEXT: movss (%rsp), %xmm1 # 4-byte Reload
+; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: callq atan2f@PLT
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+ %x = call half @llvm.atan2.f16(half %a, half %b)
+ ret half %x
+}
+
+define float @use_atan2f32(float %a, float %b) nounwind {
+; CHECK-LABEL: use_atan2f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: jmp atan2f@PLT # TAILCALL
+ %x = call float @llvm.atan2.f32(float %a, float %b)
+ ret float %x
+}
+
+define double @use_atan2f64(double %a, double %b) nounwind {
+; CHECK-LABEL: use_atan2f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: jmp atan2@PLT # TAILCALL
+ %x = call double @llvm.atan2.f64(double %a, double %b)
+ ret double %x
+}
+
+define x86_fp80 @use_atan2f80(x86_fp80 %a, x86_fp80 %b) nounwind {
+; CHECK-LABEL: use_atan2f80:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $40, %rsp
+; CHECK-NEXT: fldt {{[0-9]+}}(%rsp)
+; CHECK-NEXT: fldt {{[0-9]+}}(%rsp)
+; CHECK-NEXT: fstpt {{[0-9]+}}(%rsp)
+; CHECK-NEXT: fstpt (%rsp)
+; CHECK-NEXT: callq atan2l@PLT
+; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: retq
+ %x = call x86_fp80 @llvm.atan2.f80(x86_fp80 %a, x86_fp80 %b)
+ ret x86_fp80 %x
+}
+
+define fp128 @use_atan2fp128(fp128 %a, fp128 %b) nounwind {
+; CHECK-LABEL: use_atan2fp128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: jmp atan2f128@PLT # TAILCALL
+ %x = call fp128 @llvm.atan2.f128(fp128 %a, fp128 %b)
+ ret fp128 %x
+}
+
+define ppc_fp128 @use_atan2ppc_fp128(ppc_fp128 %a, ppc_fp128 %b) nounwind {
+; CHECK-LABEL: use_atan2ppc_fp128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq atan2l@PLT
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+ %x = call ppc_fp128 @llvm.atan2.ppcf128(ppc_fp128 %a, ppc_fp128 %b)
+ ret ppc_fp128 %x
+}
+
+declare half @llvm.atan2.f16(half, half)
+declare float @llvm.atan2.f32(float, float)
+declare double @llvm.atan2.f64(double, double)
+declare x86_fp80 @llvm.atan2.f80(x86_fp80, x86_fp80)
+declare fp128 @llvm.atan2.f128(fp128, fp128)
+declare ppc_fp128 @llvm.atan2.ppcf128(ppc_fp128, ppc_fp128)
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll
index f4a0207..1e56f34 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll
@@ -6403,7 +6403,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) {
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = ~zmm1
; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
index 487f729..da4432b 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
@@ -7298,7 +7298,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) {
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = ~zmm1
; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0
; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
index 498f250..1597e13 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
@@ -7148,7 +7148,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) {
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = ~zmm1
; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0
; AVX512F-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/pr108731.ll b/llvm/test/CodeGen/X86/pr108731.ll
index 87dce03..473b4f7 100644
--- a/llvm/test/CodeGen/X86/pr108731.ll
+++ b/llvm/test/CodeGen/X86/pr108731.ll
@@ -2,8 +2,8 @@
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,NOBMI
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,BMI
-define i64 @foo(i64 %w, i64 %x, i64 %y, i64 %z) {
-; NOBMI-LABEL: foo:
+define i64 @test_i64(i64 %w, i64 %x, i64 %y, i64 %z) {
+; NOBMI-LABEL: test_i64:
; NOBMI: # %bb.0: # %Entry
; NOBMI-NEXT: movq %rcx, %rax
; NOBMI-NEXT: andq %rdx, %rsi
@@ -14,7 +14,7 @@ define i64 @foo(i64 %w, i64 %x, i64 %y, i64 %z) {
; NOBMI-NEXT: andq %rsi, %rax
; NOBMI-NEXT: retq
;
-; BMI-LABEL: foo:
+; BMI-LABEL: test_i64:
; BMI: # %bb.0: # %Entry
; BMI-NEXT: andq %rdx, %rsi
; BMI-NEXT: andnq %rdi, %rsi, %rax
@@ -31,8 +31,91 @@ Entry:
ret i64 %and3
}
-define <16 x i8> @fooVec(<16 x i8> %w, <16 x i8> %x, <16 x i8> %y, <16 x i8> %z) {
-; NOBMI-LABEL: fooVec:
+define i32 @test_i32(i32 %w, i32 %x, i32 %y, i32 %z) {
+; NOBMI-LABEL: test_i32:
+; NOBMI: # %bb.0: # %Entry
+; NOBMI-NEXT: movl %ecx, %eax
+; NOBMI-NEXT: andl %edx, %esi
+; NOBMI-NEXT: notl %esi
+; NOBMI-NEXT: andl %edi, %esi
+; NOBMI-NEXT: notl %eax
+; NOBMI-NEXT: orl %edx, %eax
+; NOBMI-NEXT: andl %esi, %eax
+; NOBMI-NEXT: retq
+;
+; BMI-LABEL: test_i32:
+; BMI: # %bb.0: # %Entry
+; BMI-NEXT: andl %edx, %esi
+; BMI-NEXT: andnl %edi, %esi, %eax
+; BMI-NEXT: andnl %ecx, %edx, %ecx
+; BMI-NEXT: andnl %eax, %ecx, %eax
+; BMI-NEXT: retq
+Entry:
+ %and1 = and i32 %y, %x
+ %xor1 = xor i32 %and1, -1
+ %and2 = and i32 %xor1, %w
+ %.not = xor i32 %z, -1
+ %or1 = or i32 %.not, %y
+ %and3 = and i32 %and2, %or1
+ ret i32 %and3
+}
+
+define i16 @test_i16(i16 %w, i16 %x, i16 %y, i16 %z) {
+; NOBMI-LABEL: test_i16:
+; NOBMI: # %bb.0: # %Entry
+; NOBMI-NEXT: movl %ecx, %eax
+; NOBMI-NEXT: andl %edx, %esi
+; NOBMI-NEXT: notl %esi
+; NOBMI-NEXT: andl %edi, %esi
+; NOBMI-NEXT: notl %eax
+; NOBMI-NEXT: orl %edx, %eax
+; NOBMI-NEXT: andl %esi, %eax
+; NOBMI-NEXT: # kill: def $ax killed $ax killed $eax
+; NOBMI-NEXT: retq
+;
+; BMI-LABEL: test_i16:
+; BMI: # %bb.0: # %Entry
+; BMI-NEXT: andl %edx, %esi
+; BMI-NEXT: andnl %edi, %esi, %eax
+; BMI-NEXT: notl %ecx
+; BMI-NEXT: orl %edx, %ecx
+; BMI-NEXT: andl %ecx, %eax
+; BMI-NEXT: # kill: def $ax killed $ax killed $eax
+; BMI-NEXT: retq
+Entry:
+ %and1 = and i16 %y, %x
+ %xor1 = xor i16 %and1, -1
+ %and2 = and i16 %xor1, %w
+ %.not = xor i16 %z, -1
+ %or1 = or i16 %.not, %y
+ %and3 = and i16 %and2, %or1
+ ret i16 %and3
+}
+
+define i8 @test_i8(i8 %w, i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: test_i8:
+; CHECK: # %bb.0: # %Entry
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: andl %edx, %esi
+; CHECK-NEXT: notb %sil
+; CHECK-NEXT: andb %dil, %sil
+; CHECK-NEXT: notb %cl
+; CHECK-NEXT: orb %cl, %al
+; CHECK-NEXT: andb %sil, %al
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
+; CHECK-NEXT: retq
+Entry:
+ %and1 = and i8 %y, %x
+ %xor1 = xor i8 %and1, -1
+ %and2 = and i8 %xor1, %w
+ %.not = xor i8 %z, -1
+ %or1 = or i8 %.not, %y
+ %and3 = and i8 %and2, %or1
+ ret i8 %and3
+}
+
+define <16 x i8> @test_v16i8(<16 x i8> %w, <16 x i8> %x, <16 x i8> %y, <16 x i8> %z) {
+; NOBMI-LABEL: test_v16i8:
; NOBMI: # %bb.0: # %Entry
; NOBMI-NEXT: andps %xmm2, %xmm1
; NOBMI-NEXT: andnps %xmm0, %xmm1
@@ -41,7 +124,7 @@ define <16 x i8> @fooVec(<16 x i8> %w, <16 x i8> %x, <16 x i8> %y, <16 x i8> %z)
; NOBMI-NEXT: movaps %xmm2, %xmm0
; NOBMI-NEXT: retq
;
-; BMI-LABEL: fooVec:
+; BMI-LABEL: test_v16i8:
; BMI: # %bb.0: # %Entry
; BMI-NEXT: vandps %xmm1, %xmm2, %xmm1
; BMI-NEXT: vandnps %xmm0, %xmm1, %xmm0
@@ -58,6 +141,38 @@ Entry:
ret <16 x i8> %and3
}
+define <32 x i8> @test_v32i8(<32 x i8> %w, <32 x i8> %x, <32 x i8> %y, <32 x i8> %z) {
+; NOBMI-LABEL: test_v32i8:
+; NOBMI: # %bb.0: # %Entry
+; NOBMI-NEXT: andps %xmm4, %xmm2
+; NOBMI-NEXT: andps %xmm5, %xmm3
+; NOBMI-NEXT: andnps %xmm1, %xmm3
+; NOBMI-NEXT: andnps %xmm0, %xmm2
+; NOBMI-NEXT: andnps %xmm6, %xmm4
+; NOBMI-NEXT: andnps %xmm2, %xmm4
+; NOBMI-NEXT: andnps %xmm7, %xmm5
+; NOBMI-NEXT: andnps %xmm3, %xmm5
+; NOBMI-NEXT: movaps %xmm4, %xmm0
+; NOBMI-NEXT: movaps %xmm5, %xmm1
+; NOBMI-NEXT: retq
+;
+; BMI-LABEL: test_v32i8:
+; BMI: # %bb.0: # %Entry
+; BMI-NEXT: vandps %ymm1, %ymm2, %ymm1
+; BMI-NEXT: vandnps %ymm0, %ymm1, %ymm0
+; BMI-NEXT: vandnps %ymm3, %ymm2, %ymm1
+; BMI-NEXT: vandnps %ymm0, %ymm1, %ymm0
+; BMI-NEXT: retq
+Entry:
+ %and1 = and <32 x i8> %y, %x
+ %xor1 = xor <32 x i8> %and1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %and2 = and <32 x i8> %xor1, %w
+ %.not = xor <32 x i8> %z, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %or1 = or <32 x i8> %.not, %y
+ %and3 = and <32 x i8> %and2, %or1
+ ret <32 x i8> %and3
+}
+
; PR112347 - don't fold if we'd be inverting a constant, as demorgan normalisation will invert it back again.
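; Folding here would materialise not(C) as a fresh constant, and the DeMorgan
; canonicalisation would then invert it straight back, ping-ponging between the
; two forms.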
define void @PR112347(ptr %p0, ptr %p1, ptr %p2) {
; CHECK-LABEL: PR112347:
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
index 220c2e5..a2bcadd 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -2465,7 +2465,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-AVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 # [197,0,27,0,1,0,1,0,223,0,205,0,161,0,171,0,171,0,183,0,61,0,127,0,9,0,41,0,1,0,161,0]
; CHECK-AVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 # [0,205,0,241,0,1,0,163,0,223,0,183,0,1,0,239,0,103,0,171,0,1,0,183,0,0,0,183,0,1,0,221]
; CHECK-AVX512VL-NEXT: vpsllw $8, %ymm3, %ymm3
-; CHECK-AVX512VL-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3
+; CHECK-AVX512VL-NEXT: vpternlogd {{.*#+}} ymm3 = ymm3 | (ymm2 & mem)
; CHECK-AVX512VL-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm2
; CHECK-AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; CHECK-AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [128,128,1,1,1,128,1,64,128,1,128,1,128,32,1,1]
@@ -2483,7 +2483,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-AVX512VL-NEXT: vpcmpgtb %ymm4, %ymm0, %ymm0
; CHECK-AVX512VL-NEXT: vpandn %ymm0, %ymm3, %ymm3
; CHECK-AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm0
-; CHECK-AVX512VL-NEXT: vpternlogq $14, %ymm3, %ymm2, %ymm0
+; CHECK-AVX512VL-NEXT: vpternlogq {{.*#+}} ymm0 = ~ymm0 & (ymm2 | ymm3)
; CHECK-AVX512VL-NEXT: retq
%rem = srem <32 x i8> %x, <i8 13, i8 5, i8 19, i8 34, i8 2, i8 8, i8 2, i8 88, i8 62, i8 62, i8 5, i8 7, i8 97, i8 2, i8 3, i8 60, i8 3, i8 87, i8 7, i8 6, i8 84, i8 -128, i8 127, i8 56, i8 114, i8 1, i8 50, i8 7, i8 2, i8 8, i8 97, i8 117>
%cmp = icmp ne <32 x i8> %rem, zeroinitializer
diff --git a/llvm/test/CodeGen/X86/tailcall-caller-nocsr.ll b/llvm/test/CodeGen/X86/tailcall-caller-nocsr.ll
new file mode 100644
index 0000000..0385017
--- /dev/null
+++ b/llvm/test/CodeGen/X86/tailcall-caller-nocsr.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=-sse,-avx | FileCheck %s
+
+@.str = private unnamed_addr constant [6 x i8] c"%d %d\00", align 1
+
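+; The caller is marked "no_caller_saved_registers" (attributes #0 below), so it
+; must preserve every register the printf call may clobber; the tail call is
+; therefore lowered as a plain call wrapped in push/pop pairs.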
+define void @caller(i32 %0, i32 %1) #0 {
+; CHECK-LABEL: caller:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %r11
+; CHECK-NEXT: pushq %r10
+; CHECK-NEXT: pushq %r9
+; CHECK-NEXT: pushq %r8
+; CHECK-NEXT: pushq %rdx
+; CHECK-NEXT: pushq %rcx
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: movl %esi, %edx
+; CHECK-NEXT: movl %edi, %esi
+; CHECK-NEXT: movl $.L.str, %edi
+; CHECK-NEXT: callq printf@PLT
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: popq %rdx
+; CHECK-NEXT: popq %r8
+; CHECK-NEXT: popq %r9
+; CHECK-NEXT: popq %r10
+; CHECK-NEXT: popq %r11
+; CHECK-NEXT: retq
+ %3 = tail call i32 (ptr, ...) @printf(ptr @.str, i32 %0, i32 %1)
+ ret void
+}
+
+declare i32 @printf(ptr, ...) nounwind
+
+attributes #0 = { mustprogress nounwind "no_caller_saved_registers" }
diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
index b486014..21dfdc3 100644
--- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
@@ -8672,6 +8672,263 @@ entry:
ret <4 x double> %atan
}
+define <1 x float> @constrained_vector_atan2_v1f32() #0 {
+; CHECK-LABEL: constrained_vector_atan2_v1f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [2.3E+1,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT: callq atan2f@PLT
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+;
+; AVX-LABEL: constrained_vector_atan2_v1f32:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: pushq %rax
+; AVX-NEXT: .cfi_def_cfa_offset 16
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = [2.3E+1,0.0E+0,0.0E+0,0.0E+0]
+; AVX-NEXT: callq atan2f@PLT
+; AVX-NEXT: popq %rax
+; AVX-NEXT: .cfi_def_cfa_offset 8
+; AVX-NEXT: retq
+entry:
+ %atan2 = call <1 x float> @llvm.experimental.constrained.atan2.v1f32(
+ <1 x float> <float 42.0>,
+ <1 x float> <float 23.0>,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret <1 x float> %atan2
+}
+
+define <2 x double> @constrained_vector_atan2_v2f64() #0 {
+; CHECK-LABEL: constrained_vector_atan2_v2f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: subq $24, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.3100000000000001E+1,0.0E+0]
+; CHECK-NEXT: callq atan2@PLT
+; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.3E+1,0.0E+0]
+; CHECK-NEXT: callq atan2@PLT
+; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT: addq $24, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+;
+; AVX-LABEL: constrained_vector_atan2_v2f64:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: subq $24, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 32
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [2.3100000000000001E+1,0.0E+0]
+; AVX-NEXT: callq atan2@PLT
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [2.3E+1,0.0E+0]
+; AVX-NEXT: callq atan2@PLT
+; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX-NEXT: addq $24, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 8
+; AVX-NEXT: retq
+entry:
+ %atan2 = call <2 x double> @llvm.experimental.constrained.atan2.v2f64(
+ <2 x double> <double 42.0, double 42.1>,
+ <2 x double> <double 23.0, double 23.1>,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret <2 x double> %atan2
+}
+
+define <3 x float> @constrained_vector_atan2_v3f32() #0 {
+; CHECK-LABEL: constrained_vector_atan2_v3f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: subq $40, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [2.5E+1,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT: callq atan2f@PLT
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [2.3E+1,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT: callq atan2f@PLT
+; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [2.4E+1,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT: callq atan2f@PLT
+; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+;
+; AVX-LABEL: constrained_vector_atan2_v3f32:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: subq $40, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 48
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0]
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = [2.5E+1,0.0E+0,0.0E+0,0.0E+0]
+; AVX-NEXT: callq atan2f@PLT
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = [2.3E+1,0.0E+0,0.0E+0,0.0E+0]
+; AVX-NEXT: callq atan2f@PLT
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0]
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = [2.4E+1,0.0E+0,0.0E+0,0.0E+0]
+; AVX-NEXT: callq atan2f@PLT
+; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX-NEXT: addq $40, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 8
+; AVX-NEXT: retq
+entry:
+ %atan2 = call <3 x float> @llvm.experimental.constrained.atan2.v3f32(
+ <3 x float> <float 42.0, float 43.0, float 44.0>,
+ <3 x float> <float 23.0, float 24.0, float 25.0>,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret <3 x float> %atan2
+}
+
+define <3 x double> @constrained_vector_atan2_v3f64() #0 {
+; CHECK-LABEL: constrained_vector_atan2_v3f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: subq $24, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.3100000000000001E+1,0.0E+0]
+; CHECK-NEXT: callq atan2@PLT
+; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.3E+1,0.0E+0]
+; CHECK-NEXT: callq atan2@PLT
+; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.3199999999999999E+1,0.0E+0]
+; CHECK-NEXT: callq atan2@PLT
+; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT: wait
+; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
+; CHECK-NEXT: # xmm1 = mem[0],zero
+; CHECK-NEXT: addq $24, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+;
+; AVX-LABEL: constrained_vector_atan2_v3f64:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: subq $40, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 48
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [2.3100000000000001E+1,0.0E+0]
+; AVX-NEXT: callq atan2@PLT
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [2.3E+1,0.0E+0]
+; AVX-NEXT: callq atan2@PLT
+; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [2.3199999999999999E+1,0.0E+0]
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: callq atan2@PLT
+; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX-NEXT: addq $40, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 8
+; AVX-NEXT: retq
+entry:
+ %atan2 = call <3 x double> @llvm.experimental.constrained.atan2.v3f64(
+ <3 x double> <double 42.0, double 42.1, double 42.2>,
+ <3 x double> <double 23.0, double 23.1, double 23.2>,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret <3 x double> %atan2
+}
+
+define <4 x double> @constrained_vector_atan2_v4f64() #0 {
+; CHECK-LABEL: constrained_vector_atan2_v4f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: subq $40, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.3100000000000001E+1,0.0E+0]
+; CHECK-NEXT: callq atan2@PLT
+; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.3E+1,0.0E+0]
+; CHECK-NEXT: callq atan2@PLT
+; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2299999999999997E+1,0.0E+0]
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.3300000000000001E+1,0.0E+0]
+; CHECK-NEXT: callq atan2@PLT
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.3199999999999999E+1,0.0E+0]
+; CHECK-NEXT: callq atan2@PLT
+; CHECK-NEXT: movaps %xmm0, %xmm1
+; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+;
+; AVX-LABEL: constrained_vector_atan2_v4f64:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: subq $40, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 48
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2299999999999997E+1,0.0E+0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [2.3300000000000001E+1,0.0E+0]
+; AVX-NEXT: callq atan2@PLT
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [2.3199999999999999E+1,0.0E+0]
+; AVX-NEXT: callq atan2@PLT
+; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [2.3100000000000001E+1,0.0E+0]
+; AVX-NEXT: callq atan2@PLT
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [2.3E+1,0.0E+0]
+; AVX-NEXT: callq atan2@PLT
+; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX-NEXT: addq $40, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 8
+; AVX-NEXT: retq
+entry:
+ %atan2 = call <4 x double> @llvm.experimental.constrained.atan2.v4f64(
+ <4 x double> <double 42.0, double 42.1,
+ double 42.2, double 42.3>,
+ <4 x double> <double 23.0, double 23.1,
+ double 23.2, double 23.3>,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret <4 x double> %atan2
+}
+
define <1 x float> @constrained_vector_cosh_v1f32() #0 {
; CHECK-LABEL: constrained_vector_cosh_v1f32:
; CHECK: # %bb.0: # %entry
@@ -9546,6 +9803,7 @@ declare <4 x double> @llvm.experimental.constrained.tan.v4f64(<4 x double>, meta
declare <4 x double> @llvm.experimental.constrained.asin.v4f64(<4 x double>, metadata, metadata)
declare <4 x double> @llvm.experimental.constrained.acos.v4f64(<4 x double>, metadata, metadata)
declare <4 x double> @llvm.experimental.constrained.atan.v4f64(<4 x double>, metadata, metadata)
+declare <4 x double> @llvm.experimental.constrained.atan2.v4f64(<4 x double>, <4 x double>, metadata, metadata)
declare <4 x double> @llvm.experimental.constrained.sinh.v4f64(<4 x double>, metadata, metadata)
declare <4 x double> @llvm.experimental.constrained.cosh.v4f64(<4 x double>, metadata, metadata)
declare <4 x double> @llvm.experimental.constrained.tanh.v4f64(<4 x double>, metadata, metadata)
diff --git a/llvm/test/Feature/fp-intrinsics.ll b/llvm/test/Feature/fp-intrinsics.ll
index 80f8b15..ada22c3 100644
--- a/llvm/test/Feature/fp-intrinsics.ll
+++ b/llvm/test/Feature/fp-intrinsics.ll
@@ -184,7 +184,7 @@ entry:
ret double %result
}
; Verify that atan(42.0) isn't simplified when the rounding mode is unknown.
; CHECK-LABEL: fatan
; CHECK: call double @llvm.experimental.constrained.atan
define double @fatan() #0 {
@@ -195,6 +195,19 @@ entry:
ret double %result
}
+; Verify that atan2(42.0, 23.0) isn't simplified when the rounding mode is unknown.
+; CHECK-LABEL: fatan2
+; CHECK: call double @llvm.experimental.constrained.atan2
+define double @fatan2() #0 {
+entry:
+ %result = call double @llvm.experimental.constrained.atan2.f64(
+ double 42.0,
+ double 23.0,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret double %result
+}
+
; Verify that cosh(42.0) isn't simplified when the rounding mode is unknown.
; CHECK-LABEL: fcosh
; CHECK: call double @llvm.experimental.constrained.cosh
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
index 67038f4c..210d558 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
@@ -244,49 +244,67 @@ v_add_lshl_u32 v5, src_scc, vcc_lo, -1
v_add_lshl_u32 v255, 0xaf123456, vcc_hi, null
// GFX11: encoding: [0xff,0x00,0x47,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf]
-v_add_nc_i16 v5, v1, v2
-// GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00]
+v_add_nc_i16 v5.l, v1.h, v2.l
+// GFX11: encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
-v_add_nc_i16 v5, v255, v255
-// GFX11: encoding: [0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00]
+v_add_nc_i16 v5.l, v255.l, v255.h
+// GFX11: encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
-v_add_nc_i16 v5, s1, s2
+v_add_nc_i16 v5.l, s1, s2
// GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00]
-v_add_nc_i16 v5, s105, s105
+v_add_nc_i16 v5.l, s105, s105
// GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00]
-v_add_nc_i16 v5, vcc_lo, ttmp15
+v_add_nc_i16 v5.l, vcc_lo, ttmp15
// GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00]
-v_add_nc_i16 v5, vcc_hi, 0xfe0b
+v_add_nc_i16 v5.l, vcc_hi, 0xfe0b
// GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-v_add_nc_i16 v5, ttmp15, src_scc
+v_add_nc_i16 v5.l, ttmp15, src_scc
// GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00]
-v_add_nc_i16 v5, m0, 0.5
+v_add_nc_i16 v5.l, m0, 0.5
// GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xe0,0x01,0x00]
-v_add_nc_i16 v5, exec_lo, -1
+v_add_nc_i16 v5.l, exec_lo, -1
// GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00]
-v_add_nc_i16 v5, exec_hi, null
+v_add_nc_i16 v5.l, exec_hi, null
// GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00]
-v_add_nc_i16 v5, null, exec_lo op_sel:[1,1,1]
+v_add_nc_i16 v5.l, null, exec_lo
+// GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00]
+
+v_add_nc_i16 v5.l, -1, exec_hi
+// GFX11: encoding: [0x05,0x00,0x0d,0xd7,0xc1,0xfe,0x00,0x00]
+
+v_add_nc_i16 v5.h, null, exec_lo op_sel:[1,1,1]
// GFX11: encoding: [0x05,0x58,0x0d,0xd7,0x7c,0xfc,0x00,0x00]
-v_add_nc_i16 v5, -1, exec_hi op_sel:[0,0,0]
+v_add_nc_i16 v5.l, -1, exec_hi op_sel:[0,0,0]
// GFX11: encoding: [0x05,0x00,0x0d,0xd7,0xc1,0xfe,0x00,0x00]
-v_add_nc_i16 v5, 0.5, m0 op_sel:[1,0,0]
+v_add_nc_i16 v5.l, 0.5, m0 op_sel:[1,0,0]
// GFX11: encoding: [0x05,0x08,0x0d,0xd7,0xf0,0xfa,0x00,0x00]
-v_add_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0]
+v_add_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0]
// GFX11: encoding: [0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00]
-v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+// GFX11: encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_add_nc_i16 v5.l, src_scc, vcc_lo
+// GFX11: encoding: [0x05,0x00,0x0d,0xd7,0xfd,0xd4,0x00,0x00]
+
+v_add_nc_i16 v5.l, v1.h, v2.l
+// GFX11: encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
+
+v_add_nc_i16 v5.l, v255.l, v255.h
+// GFX11: encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
+
+v_add_nc_i16 v255.h, 0xfe0b, vcc_hi clamp
// GFX11: encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
v_add_nc_i32 v5, v1, v2
@@ -334,49 +352,67 @@ v_add_nc_i32 v5, src_scc, vcc_lo
v_add_nc_i32 v255, 0xaf123456, vcc_hi clamp
// GFX11: encoding: [0xff,0x80,0x26,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
-v_add_nc_u16 v5, v1, v2
-// GFX11: encoding: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00]
+v_add_nc_u16 v5.l, v1.h, v2.l
+// GFX11: encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
-v_add_nc_u16 v5, v255, v255
-// GFX11: encoding: [0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00]
+v_add_nc_u16 v5.l, v255.l, v255.h
+// GFX11: encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00]
-v_add_nc_u16 v5, s1, s2
+v_add_nc_u16 v5.l, s1, s2
// GFX11: encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00]
-v_add_nc_u16 v5, s105, s105
+v_add_nc_u16 v5.l, s105, s105
// GFX11: encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00]
-v_add_nc_u16 v5, vcc_lo, ttmp15
+v_add_nc_u16 v5.l, vcc_lo, ttmp15
// GFX11: encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00]
-v_add_nc_u16 v5, vcc_hi, 0xfe0b
+v_add_nc_u16 v5.l, vcc_hi, 0xfe0b
// GFX11: encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-v_add_nc_u16 v5, ttmp15, src_scc
+v_add_nc_u16 v5.l, ttmp15, src_scc
// GFX11: encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00]
-v_add_nc_u16 v5, m0, 0.5
+v_add_nc_u16 v5.l, m0, 0.5
// GFX11: encoding: [0x05,0x00,0x03,0xd7,0x7d,0xe0,0x01,0x00]
-v_add_nc_u16 v5, exec_lo, -1
+v_add_nc_u16 v5.l, exec_lo, -1
// GFX11: encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00]
-v_add_nc_u16 v5, exec_hi, null
+v_add_nc_u16 v5.l, exec_hi, null
// GFX11: encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00]
-v_add_nc_u16 v5, null, exec_lo op_sel:[1,1,1]
+v_add_nc_u16 v5.l, null, exec_lo
+// GFX11: encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00]
+
+v_add_nc_u16 v5.l, -1, exec_hi
+// GFX11: encoding: [0x05,0x00,0x03,0xd7,0xc1,0xfe,0x00,0x00]
+
+v_add_nc_u16 v5.h, null, exec_lo op_sel:[1,1,1]
// GFX11: encoding: [0x05,0x58,0x03,0xd7,0x7c,0xfc,0x00,0x00]
-v_add_nc_u16 v5, -1, exec_hi op_sel:[0,0,0]
+v_add_nc_u16 v5.l, -1, exec_hi op_sel:[0,0,0]
// GFX11: encoding: [0x05,0x00,0x03,0xd7,0xc1,0xfe,0x00,0x00]
-v_add_nc_u16 v5, 0.5, m0 op_sel:[1,0,0]
+v_add_nc_u16 v5.l, 0.5, m0 op_sel:[1,0,0]
// GFX11: encoding: [0x05,0x08,0x03,0xd7,0xf0,0xfa,0x00,0x00]
-v_add_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0]
+v_add_nc_u16 v5.l, src_scc, vcc_lo op_sel:[0,1,0]
// GFX11: encoding: [0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00]
-v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+// GFX11: encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_add_nc_u16 v5.l, src_scc, vcc_lo
+// GFX11: encoding: [0x05,0x00,0x03,0xd7,0xfd,0xd4,0x00,0x00]
+
+v_add_nc_u16 v5.l, v1.h, v2.l
+// GFX11: encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+
+v_add_nc_u16 v5.l, v255.l, v255.h
+// GFX11: encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00]
+
+v_add_nc_u16 v255.h, 0xfe0b, vcc_hi clamp
// GFX11: encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
v_alignbit_b32 v5, v1, v2, s3
@@ -5801,49 +5837,67 @@ v_sub_co_u32 v5, ttmp[14:15], src_scc, vcc_lo
v_sub_co_u32 v255, null, 0xaf123456, vcc_hi clamp
// GFX11: encoding: [0xff,0xfc,0x01,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
-v_sub_nc_i16 v5, v1, v2
-// GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00]
+v_sub_nc_i16 v5.l, v1.h, v2.l
+// GFX11: encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00]
-v_sub_nc_i16 v5, v255, v255
-// GFX11: encoding: [0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00]
+v_sub_nc_i16 v5.l, v255.l, v255.h
+// GFX11: encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00]
-v_sub_nc_i16 v5, s1, s2
+v_sub_nc_i16 v5.l, s1, s2
// GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00]
-v_sub_nc_i16 v5, s105, s105
+v_sub_nc_i16 v5.l, s105, s105
// GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00]
-v_sub_nc_i16 v5, vcc_lo, ttmp15
+v_sub_nc_i16 v5.l, vcc_lo, ttmp15
// GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00]
-v_sub_nc_i16 v5, vcc_hi, 0xfe0b
+v_sub_nc_i16 v5.l, vcc_hi, 0xfe0b
// GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-v_sub_nc_i16 v5, ttmp15, src_scc
+v_sub_nc_i16 v5.l, ttmp15, src_scc
// GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00]
-v_sub_nc_i16 v5, m0, 0.5
+v_sub_nc_i16 v5.l, m0, 0.5
// GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xe0,0x01,0x00]
-v_sub_nc_i16 v5, exec_lo, -1
+v_sub_nc_i16 v5.l, exec_lo, -1
// GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00]
-v_sub_nc_i16 v5, exec_hi, null
+v_sub_nc_i16 v5.l, exec_hi, null
// GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00]
-v_sub_nc_i16 v5, null, exec_lo op_sel:[1,1,1]
+v_sub_nc_i16 v5.l, null, exec_lo
+// GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00]
+
+v_sub_nc_i16 v5.l, -1, exec_hi
+// GFX11: encoding: [0x05,0x00,0x0e,0xd7,0xc1,0xfe,0x00,0x00]
+
+v_sub_nc_i16 v5.h, null, exec_lo op_sel:[1,1,1]
// GFX11: encoding: [0x05,0x58,0x0e,0xd7,0x7c,0xfc,0x00,0x00]
-v_sub_nc_i16 v5, -1, exec_hi op_sel:[0,0,0]
+v_sub_nc_i16 v5.l, -1, exec_hi op_sel:[0,0,0]
// GFX11: encoding: [0x05,0x00,0x0e,0xd7,0xc1,0xfe,0x00,0x00]
-v_sub_nc_i16 v5, 0.5, m0 op_sel:[1,0,0]
+v_sub_nc_i16 v5.l, 0.5, m0 op_sel:[1,0,0]
// GFX11: encoding: [0x05,0x08,0x0e,0xd7,0xf0,0xfa,0x00,0x00]
-v_sub_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0]
+v_sub_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0]
// GFX11: encoding: [0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00]
-v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+// GFX11: encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_sub_nc_i16 v5.l, src_scc, vcc_lo
+// GFX11: encoding: [0x05,0x00,0x0e,0xd7,0xfd,0xd4,0x00,0x00]
+
+v_sub_nc_i16 v5.l, v1.h, v2.l
+// GFX11: encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00]
+
+v_sub_nc_i16 v5.l, v255.l, v255.h
+// GFX11: encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00]
+
+v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi clamp
// GFX11: encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
v_sub_nc_i32 v5, v1, v2
@@ -5891,49 +5945,67 @@ v_sub_nc_i32 v5, src_scc, vcc_lo
v_sub_nc_i32 v255, 0xaf123456, vcc_hi clamp
// GFX11: encoding: [0xff,0x80,0x25,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
-v_sub_nc_u16 v5, v1, v2
-// GFX11: encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00]
+v_sub_nc_u16 v5.l, v1.h, v2.l
+// GFX11: encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
-v_sub_nc_u16 v5, v255, v255
-// GFX11: encoding: [0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00]
+v_sub_nc_u16 v5.l, v255.l, v255.h
+// GFX11: encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00]
-v_sub_nc_u16 v5, s1, s2
+v_sub_nc_u16 v5.l, s1, s2
// GFX11: encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00]
-v_sub_nc_u16 v5, s105, s105
+v_sub_nc_u16 v5.l, s105, s105
// GFX11: encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00]
-v_sub_nc_u16 v5, vcc_lo, ttmp15
+v_sub_nc_u16 v5.l, vcc_lo, ttmp15
// GFX11: encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00]
-v_sub_nc_u16 v5, vcc_hi, 0xfe0b
+v_sub_nc_u16 v5.l, vcc_hi, 0xfe0b
// GFX11: encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-v_sub_nc_u16 v5, ttmp15, src_scc
+v_sub_nc_u16 v5.l, ttmp15, src_scc
// GFX11: encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00]
-v_sub_nc_u16 v5, m0, 0.5
+v_sub_nc_u16 v5.l, m0, 0.5
// GFX11: encoding: [0x05,0x00,0x04,0xd7,0x7d,0xe0,0x01,0x00]
-v_sub_nc_u16 v5, exec_lo, -1
+v_sub_nc_u16 v5.l, exec_lo, -1
// GFX11: encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00]
-v_sub_nc_u16 v5, exec_hi, null
+v_sub_nc_u16 v5.l, exec_hi, null
// GFX11: encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00]
-v_sub_nc_u16 v5, null, exec_lo op_sel:[1,1,1]
+v_sub_nc_u16 v5.l, null, exec_lo
+// GFX11: encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00]
+
+v_sub_nc_u16 v5.l, -1, exec_hi
+// GFX11: encoding: [0x05,0x00,0x04,0xd7,0xc1,0xfe,0x00,0x00]
+
+v_sub_nc_u16 v5.h, null, exec_lo op_sel:[1,1,1]
// GFX11: encoding: [0x05,0x58,0x04,0xd7,0x7c,0xfc,0x00,0x00]
-v_sub_nc_u16 v5, -1, exec_hi op_sel:[0,0,0]
+v_sub_nc_u16 v5.l, -1, exec_hi op_sel:[0,0,0]
// GFX11: encoding: [0x05,0x00,0x04,0xd7,0xc1,0xfe,0x00,0x00]
-v_sub_nc_u16 v5, 0.5, m0 op_sel:[1,0,0]
+v_sub_nc_u16 v5.l, 0.5, m0 op_sel:[1,0,0]
// GFX11: encoding: [0x05,0x08,0x04,0xd7,0xf0,0xfa,0x00,0x00]
-v_sub_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0]
+v_sub_nc_u16 v5.l, src_scc, vcc_lo op_sel:[0,1,0]
// GFX11: encoding: [0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00]
-v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+// GFX11: encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_sub_nc_u16 v5.l, src_scc, vcc_lo
+// GFX11: encoding: [0x05,0x00,0x04,0xd7,0xfd,0xd4,0x00,0x00]
+
+v_sub_nc_u16 v5.l, v1.h, v2.l
+// GFX11: encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
+
+v_sub_nc_u16 v5.l, v255.l, v255.h
+// GFX11: encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00]
+
+v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi clamp
// GFX11: encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
v_subrev_co_u32 v5, s6, v1, v2
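The .l and .h suffixes that replace plain VGPR operands throughout the tests above are the true16 surface syntax for op_sel, and the expected encodings show where those bits land: in the second byte of the VOP3 encoding, bit 3 selects the high half of src0, bit 4 the high half of src1, bit 6 the high half of the destination, and bit 7 is clamp. A minimal Python sketch of that packing, inferred from the encodings in this file (the helper name is illustrative, not part of the patch):

    def vop3_byte1(src0_hi=0, src1_hi=0, dst_hi=0, clamp=0):
        # op_sel:[src0,src1,dst] plus clamp, as seen in byte 1 of the checks
        return (src0_hi << 3) | (src1_hi << 4) | (dst_hi << 6) | (clamp << 7)

    assert vop3_byte1(src0_hi=1) == 0x08                       # v5.l, v1.h, v2.l
    assert vop3_byte1(src0_hi=1, src1_hi=1, dst_hi=1) == 0x58  # op_sel:[1,1,1]
    assert vop3_byte1(dst_hi=1, clamp=1) == 0xc0               # v255.h, ... clamp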
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s
index 3c693c5..c82b61e 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s
@@ -194,47 +194,47 @@ v_add_lshl_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bo
v_add_lshl_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
// GFX11: [0xff,0x00,0x47,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30]
-v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0]
// GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3]
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3]
// GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-v_add_nc_i16_e64_dpp v5, v1, v2 row_mirror
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror
// GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-v_add_nc_i16_e64_dpp v5, v1, v2 row_half_mirror
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror
// GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:1
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1
// GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:15
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15
// GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:1
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1
// GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:15
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15
// GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:1
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1
// GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:15
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15
// GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-v_add_nc_i16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-v_add_nc_i16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-v_add_nc_i16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
-v_add_nc_i16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: [0xff,0x80,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
+v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
v_add_nc_i32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
// GFX11: [0x05,0x00,0x26,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -278,47 +278,47 @@ v_add_nc_i32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr
v_add_nc_i32_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
// GFX11: [0xff,0x80,0x26,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
-v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0]
// GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3]
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3]
// GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-v_add_nc_u16_e64_dpp v5, v1, v2 row_mirror
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror
// GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-v_add_nc_u16_e64_dpp v5, v1, v2 row_half_mirror
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror
// GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:1
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1
// GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:15
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15
// GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:1
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1
// GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:15
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15
// GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:1
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1
// GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:15
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15
// GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-v_add_nc_u16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-v_add_nc_u16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-v_add_nc_u16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
-v_add_nc_u16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: [0xff,0x80,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
+v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
v_alignbit_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX11: [0x05,0x00,0x16,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -4116,47 +4116,47 @@ v_sub_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:
v_sub_co_u32_e64_dpp v255, null, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
// GFX11: [0xff,0xfc,0x01,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
-v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0]
// GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3]
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3]
// GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_mirror
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror
// GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_half_mirror
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror
// GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:1
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1
// GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:15
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15
// GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:1
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1
// GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:15
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15
// GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:1
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1
// GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:15
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15
// GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
-v_sub_nc_i16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: [0xff,0x80,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
+v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
v_sub_nc_i32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
// GFX11: [0x05,0x00,0x25,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -4200,47 +4200,47 @@ v_sub_nc_i32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr
v_sub_nc_i32_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
// GFX11: [0xff,0x80,0x25,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
-v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0]
// GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3]
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3]
// GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_mirror
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror
// GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_half_mirror
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror
// GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:1
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1
// GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:15
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15
// GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:1
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1
// GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:15
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15
// GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:1
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1
// GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:15
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15
// GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
-v_sub_nc_u16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: [0xff,0x80,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
+v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
v_subrev_co_u32_e64_dpp v5, s6, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x06,0x02,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -4475,30 +4475,6 @@ v_xor_b16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
v_xor_b16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
// GFX11: [0xff,0x00,0x64,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
-v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-
-v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-
-v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
-// GFX11: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-
-v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
-// GFX11: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-
-v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-
-v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-
-v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
-// GFX11: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-
-v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
-// GFX11: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-
v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
// GFX11: [0x05,0x0a,0x12,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
@@ -4724,30 +4700,6 @@ v_pack_b32_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] row_xmask:0 row_mask:0x1 ban
v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
// GFX11: [0xff,0x13,0x11,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-
-v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-
-v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
-// GFX11: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-
-v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
-// GFX11: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-
-v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-
-v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-
-v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
-// GFX11: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-
-v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
-// GFX11: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-
v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
// GFX11: encoding: [0x00,0x00,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
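For the DPP16 forms above, the half selection rides in the same op_sel byte, while the swizzle itself appears as a control word spread over the two bytes after the src0 register: a low byte plus one carry bit. The bases below are read off the expected encodings in this file, so treat them as an inference from the checks rather than a spec quote; a small sketch under that assumption:

    def dpp16_ctrl(kind, n=0):
        # DPP control word as it appears in the checks: low byte, then carry bit
        base = {"quad_perm": 0x000, "row_shl": 0x100, "row_shr": 0x110,
                "row_ror": 0x120, "row_mirror": 0x140, "row_half_mirror": 0x141,
                "row_share": 0x150, "row_xmask": 0x160}[kind]
        return base + n

    def quad_perm(sel):
        # four 2-bit lane selects, lane 0 in the low bits
        return sum((s & 3) << (2 * i) for i, s in enumerate(sel))

    assert quad_perm([3, 2, 1, 0]) == 0x1b and quad_perm([0, 1, 2, 3]) == 0xe4
    assert dpp16_ctrl("row_share", 15) == 0x15f   # encoded as 0x5f,0x01 above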
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s
index 7970927..7336968 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s
@@ -114,14 +114,23 @@ v_add_lshl_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_add_lshl_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: [0xff,0x00,0x47,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00]
-v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
// GFX11: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: [0x05,0x00,0x0d,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: [0x05,0x10,0x0d,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-v_add_nc_i16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: [0xff,0x80,0x0d,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: [0xff,0xc0,0x0d,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
v_add_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: [0x05,0x00,0x26,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -132,14 +141,23 @@ v_add_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_add_nc_i32_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: [0xff,0x80,0x26,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
// GFX11: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: [0x05,0x00,0x03,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: [0x05,0x10,0x03,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-v_add_nc_u16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: [0xff,0x80,0x03,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: [0xff,0xc0,0x03,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
v_alignbit_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: [0x05,0x00,0x16,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -2601,14 +2619,23 @@ v_sub_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_sub_co_u32_e64_dpp v255, null, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: [0xff,0xfc,0x01,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
// GFX11: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: [0x05,0x00,0x0e,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: [0x05,0x10,0x0e,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-v_sub_nc_i16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: [0xff,0x80,0x0e,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: [0xff,0xc0,0x0e,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
v_sub_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: [0x05,0x00,0x25,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -2619,14 +2646,23 @@ v_sub_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_sub_nc_i32_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: [0xff,0x80,0x25,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
// GFX11: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: [0x05,0x00,0x04,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: [0x05,0x10,0x04,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-v_sub_nc_u16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: [0xff,0x80,0x04,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: [0xff,0xc0,0x04,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
v_subrev_co_u32_e64_dpp v5, s6, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x06,0x02,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -2748,30 +2784,6 @@ v_xor_b16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_xor_b16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: [0xff,0x00,0x64,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
-// GFX11: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-
-v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
-// GFX11: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-
v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0]
// GFX11: [0x05,0x0a,0x12,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
@@ -2997,30 +3009,6 @@ v_pack_b32_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0]
v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1
// GFX11: [0xff,0x13,0x11,0xd7,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
-v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
-// GFX11: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-
-v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
-// GFX11: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-
v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,4,4,4]
// GFX11: encoding: [0x00,0x00,0x66,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92]
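The dpp8 selector packs eight 3-bit lane indices into the 24-bit trailer of the encoding, lane 0 in the low bits, and the 0xe9 in the src0 slot becomes 0xea when fi:1 is set, which is the only byte that differs between the paired checks above. A sketch of the packing, checked against two of the expected encodings in this file (the helper name is illustrative):

    def pack_dpp8(sel):
        # eight 3-bit lane selectors -> 24-bit little-endian trailer
        word = 0
        for i, s in enumerate(sel):
            word |= (s & 7) << (3 * i)
        return word.to_bytes(3, "little")

    assert pack_dpp8([7, 6, 5, 4, 3, 2, 1, 0]) == bytes([0x77, 0x39, 0x05])
    assert pack_dpp8([0, 1, 2, 3, 4, 4, 4, 4]) == bytes([0x88, 0x46, 0x92])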
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s
index f28933e..1ae1eaf 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s
@@ -208,49 +208,58 @@ v_add_lshl_u32 v5, src_scc, vcc_lo, -1
v_add_lshl_u32 v255, 0xaf123456, vcc_hi, null
// GFX12: encoding: [0xff,0x00,0x47,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf]
-v_add_nc_i16 v5, v1, v2
+v_add_nc_i16 v5.l, v1.l, v2.l
// GFX12: encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00]
-v_add_nc_i16 v5, v255, v255
+v_add_nc_i16 v5.l, v1.h, v2.l
+// GFX12: encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
+
+v_add_nc_i16 v5.l, v255.l, v255.l
// GFX12: encoding: [0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00]
-v_add_nc_i16 v5, s1, s2
+v_add_nc_i16 v5.l, v255.l, v255.h
+// GFX12: encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
+
+v_add_nc_i16 v5.l, s1, s2
// GFX12: encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00]
-v_add_nc_i16 v5, s105, s105
+v_add_nc_i16 v5.l, s105, s105
// GFX12: encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00]
-v_add_nc_i16 v5, vcc_lo, ttmp15
+v_add_nc_i16 v5.l, vcc_lo, ttmp15
// GFX12: encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00]
-v_add_nc_i16 v5, vcc_hi, 0xfe0b
+v_add_nc_i16 v5.l, vcc_hi, 0xfe0b
// GFX12: encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-v_add_nc_i16 v5, ttmp15, src_scc
+v_add_nc_i16 v5.l, ttmp15, src_scc
// GFX12: encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00]
-v_add_nc_i16 v5, m0, 0.5
+v_add_nc_i16 v5.l, m0, 0.5
// GFX12: encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xe0,0x01,0x00]
-v_add_nc_i16 v5, exec_lo, -1
+v_add_nc_i16 v5.l, exec_lo, -1
// GFX12: encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00]
-v_add_nc_i16 v5, exec_hi, null
+v_add_nc_i16 v5.l, exec_hi, null
// GFX12: encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00]
-v_add_nc_i16 v5, null, exec_lo op_sel:[1,1,1]
+v_add_nc_i16 v5.h, null, exec_lo op_sel:[1,1,1]
// GFX12: encoding: [0x05,0x58,0x0d,0xd7,0x7c,0xfc,0x00,0x00]
-v_add_nc_i16 v5, -1, exec_hi op_sel:[0,0,0]
+v_add_nc_i16 v5.l, -1, exec_hi op_sel:[0,0,0]
// GFX12: encoding: [0x05,0x00,0x0d,0xd7,0xc1,0xfe,0x00,0x00]
-v_add_nc_i16 v5, 0.5, m0 op_sel:[1,0,0]
+v_add_nc_i16 v5.l, 0.5, m0 op_sel:[1,0,0]
// GFX12: encoding: [0x05,0x08,0x0d,0xd7,0xf0,0xfa,0x00,0x00]
-v_add_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0]
+v_add_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0]
// GFX12: encoding: [0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00]
-v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+// GFX12: encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_add_nc_i16 v255.h, 0xfe0b, vcc_hi clamp
// GFX12: encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
v_add_nc_i32 v5, v1, v2
@@ -298,49 +307,58 @@ v_add_nc_i32 v5, src_scc, vcc_lo
v_add_nc_i32 v255, 0xaf123456, vcc_hi clamp
// GFX12: encoding: [0xff,0x80,0x26,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
-v_add_nc_u16 v5, v1, v2
+v_add_nc_u16 v5.l, v1.l, v2.l
// GFX12: encoding: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00]
-v_add_nc_u16 v5, v255, v255
+v_add_nc_u16 v5.l, v1.h, v2.l
+// GFX12: encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+
+v_add_nc_u16 v5.l, v255.l, v255.l
// GFX12: encoding: [0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00]
-v_add_nc_u16 v5, s1, s2
+v_add_nc_u16 v5.l, v255.l, v255.h
+// GFX12: encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00]
+
+v_add_nc_u16 v5.l, s1, s2
// GFX12: encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00]
-v_add_nc_u16 v5, s105, s105
+v_add_nc_u16 v5.l, s105, s105
// GFX12: encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00]
-v_add_nc_u16 v5, vcc_lo, ttmp15
+v_add_nc_u16 v5.l, vcc_lo, ttmp15
// GFX12: encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00]
-v_add_nc_u16 v5, vcc_hi, 0xfe0b
+v_add_nc_u16 v5.l, vcc_hi, 0xfe0b
// GFX12: encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-v_add_nc_u16 v5, ttmp15, src_scc
+v_add_nc_u16 v5.l, ttmp15, src_scc
// GFX12: encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00]
-v_add_nc_u16 v5, m0, 0.5
+v_add_nc_u16 v5.l, m0, 0.5
// GFX12: encoding: [0x05,0x00,0x03,0xd7,0x7d,0xe0,0x01,0x00]
-v_add_nc_u16 v5, exec_lo, -1
+v_add_nc_u16 v5.l, exec_lo, -1
// GFX12: encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00]
-v_add_nc_u16 v5, exec_hi, null
+v_add_nc_u16 v5.l, exec_hi, null
// GFX12: encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00]
-v_add_nc_u16 v5, null, exec_lo op_sel:[1,1,1]
+v_add_nc_u16 v5.h, null, exec_lo op_sel:[1,1,1]
// GFX12: encoding: [0x05,0x58,0x03,0xd7,0x7c,0xfc,0x00,0x00]
-v_add_nc_u16 v5, -1, exec_hi op_sel:[0,0,0]
+v_add_nc_u16 v5.l, -1, exec_hi op_sel:[0,0,0]
// GFX12: encoding: [0x05,0x00,0x03,0xd7,0xc1,0xfe,0x00,0x00]
-v_add_nc_u16 v5, 0.5, m0 op_sel:[1,0,0]
+v_add_nc_u16 v5.l, 0.5, m0 op_sel:[1,0,0]
// GFX12: encoding: [0x05,0x08,0x03,0xd7,0xf0,0xfa,0x00,0x00]
-v_add_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0]
+v_add_nc_u16 v5.l, src_scc, vcc_lo op_sel:[0,1,0]
// GFX12: encoding: [0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00]
-v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+// GFX12: encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_add_nc_u16 v255.h, 0xfe0b, vcc_hi clamp
// GFX12: encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
v_alignbit_b32 v5, v1, v2, s3
@@ -5696,49 +5714,58 @@ v_sub_co_u32 v5, ttmp[14:15], src_scc, vcc_lo
v_sub_co_u32 v255, null, 0xaf123456, vcc_hi clamp
// GFX12: encoding: [0xff,0xfc,0x01,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
-v_sub_nc_i16 v5, v1, v2
+v_sub_nc_i16 v5.l, v1.l, v2.l
// GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00]
-v_sub_nc_i16 v5, v255, v255
+v_sub_nc_i16 v5.l, v1.h, v2.l
+// GFX12: encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00]
+
+v_sub_nc_i16 v5.l, v255.l, v255.l
// GFX12: encoding: [0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00]
-v_sub_nc_i16 v5, s1, s2
+v_sub_nc_i16 v5.l, v255.l, v255.h
+// GFX12: encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00]
+
+v_sub_nc_i16 v5.l, s1, s2
// GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00]
-v_sub_nc_i16 v5, s105, s105
+v_sub_nc_i16 v5.l, s105, s105
// GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00]
-v_sub_nc_i16 v5, vcc_lo, ttmp15
+v_sub_nc_i16 v5.l, vcc_lo, ttmp15
// GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00]
-v_sub_nc_i16 v5, vcc_hi, 0xfe0b
+v_sub_nc_i16 v5.l, vcc_hi, 0xfe0b
// GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-v_sub_nc_i16 v5, ttmp15, src_scc
+v_sub_nc_i16 v5.l, ttmp15, src_scc
// GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00]
-v_sub_nc_i16 v5, m0, 0.5
+v_sub_nc_i16 v5.l, m0, 0.5
// GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xe0,0x01,0x00]
-v_sub_nc_i16 v5, exec_lo, -1
+v_sub_nc_i16 v5.l, exec_lo, -1
// GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00]
-v_sub_nc_i16 v5, exec_hi, null
+v_sub_nc_i16 v5.l, exec_hi, null
// GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00]
-v_sub_nc_i16 v5, null, exec_lo op_sel:[1,1,1]
+v_sub_nc_i16 v5.h, null, exec_lo op_sel:[1,1,1]
// GFX12: encoding: [0x05,0x58,0x0e,0xd7,0x7c,0xfc,0x00,0x00]
-v_sub_nc_i16 v5, -1, exec_hi op_sel:[0,0,0]
+v_sub_nc_i16 v5.l, -1, exec_hi op_sel:[0,0,0]
// GFX12: encoding: [0x05,0x00,0x0e,0xd7,0xc1,0xfe,0x00,0x00]
-v_sub_nc_i16 v5, 0.5, m0 op_sel:[1,0,0]
+v_sub_nc_i16 v5.l, 0.5, m0 op_sel:[1,0,0]
// GFX12: encoding: [0x05,0x08,0x0e,0xd7,0xf0,0xfa,0x00,0x00]
-v_sub_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0]
+v_sub_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0]
// GFX12: encoding: [0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00]
-v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+// GFX12: encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi clamp
// GFX12: encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
v_sub_nc_i32 v5, v1, v2
@@ -5786,49 +5813,58 @@ v_sub_nc_i32 v5, src_scc, vcc_lo
v_sub_nc_i32 v255, 0xaf123456, vcc_hi clamp
// GFX12: encoding: [0xff,0x80,0x25,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
-v_sub_nc_u16 v5, v1, v2
+v_sub_nc_u16 v5.l, v1.l, v2.l
// GFX12: encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00]
-v_sub_nc_u16 v5, v255, v255
+v_sub_nc_u16 v5.l, v1.h, v2.l
+// GFX12: encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
+
+v_sub_nc_u16 v5.l, v255.l, v255.l
// GFX12: encoding: [0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00]
-v_sub_nc_u16 v5, s1, s2
+v_sub_nc_u16 v5.l, v255.l, v255.h
+// GFX12: encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00]
+
+v_sub_nc_u16 v5.l, s1, s2
// GFX12: encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00]
-v_sub_nc_u16 v5, s105, s105
+v_sub_nc_u16 v5.l, s105, s105
// GFX12: encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00]
-v_sub_nc_u16 v5, vcc_lo, ttmp15
+v_sub_nc_u16 v5.l, vcc_lo, ttmp15
// GFX12: encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00]
-v_sub_nc_u16 v5, vcc_hi, 0xfe0b
+v_sub_nc_u16 v5.l, vcc_hi, 0xfe0b
// GFX12: encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
-v_sub_nc_u16 v5, ttmp15, src_scc
+v_sub_nc_u16 v5.l, ttmp15, src_scc
// GFX12: encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00]
-v_sub_nc_u16 v5, m0, 0.5
+v_sub_nc_u16 v5.l, m0, 0.5
// GFX12: encoding: [0x05,0x00,0x04,0xd7,0x7d,0xe0,0x01,0x00]
-v_sub_nc_u16 v5, exec_lo, -1
+v_sub_nc_u16 v5.l, exec_lo, -1
// GFX12: encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00]
-v_sub_nc_u16 v5, exec_hi, null
+v_sub_nc_u16 v5.l, exec_hi, null
// GFX12: encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00]
-v_sub_nc_u16 v5, null, exec_lo op_sel:[1,1,1]
+v_sub_nc_u16 v5.h, null, exec_lo op_sel:[1,1,1]
// GFX12: encoding: [0x05,0x58,0x04,0xd7,0x7c,0xfc,0x00,0x00]
-v_sub_nc_u16 v5, -1, exec_hi op_sel:[0,0,0]
+v_sub_nc_u16 v5.l, -1, exec_hi op_sel:[0,0,0]
// GFX12: encoding: [0x05,0x00,0x04,0xd7,0xc1,0xfe,0x00,0x00]
-v_sub_nc_u16 v5, 0.5, m0 op_sel:[1,0,0]
+v_sub_nc_u16 v5.l, 0.5, m0 op_sel:[1,0,0]
// GFX12: encoding: [0x05,0x08,0x04,0xd7,0xf0,0xfa,0x00,0x00]
-v_sub_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0]
+v_sub_nc_u16 v5.l, src_scc, vcc_lo op_sel:[0,1,0]
// GFX12: encoding: [0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00]
-v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+// GFX12: encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi clamp
// GFX12: encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
v_subrev_co_u32 v5, s6, v1, v2
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
index adf3790..56bd0ee 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
@@ -214,47 +214,71 @@ v_add_lshl_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bo
v_add_lshl_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
// GFX12: [0xff,0x00,0x47,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30]
-v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
-// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
+// GFX12: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3]
-// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+v_add_nc_i16_e64_dpp v255.l, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: [0xff,0x80,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
-v_add_nc_i16_e64_dpp v5, v1, v2 row_mirror
-// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-v_add_nc_i16_e64_dpp v5, v1, v2 row_half_mirror
-// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:1
-// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
+// GFX12: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:15
-// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:1
-// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:15
-// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror
+// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror
+// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:1
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1
// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:15
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15
// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-v_add_nc_i16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-v_add_nc_i16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-v_add_nc_i16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1
+// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15
+// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1
+// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15
+// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
-v_add_nc_i16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: [0xff,0x80,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
+v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+
+v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
+
+v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
v_add_nc_i32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x26,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -298,47 +322,71 @@ v_add_nc_i32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr
v_add_nc_i32_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
// GFX12: [0xff,0x80,0x26,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
-v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
-// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
+// GFX12: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3]
-// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+v_add_nc_u16_e64_dpp v255.l, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: [0xff,0x80,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
-v_add_nc_u16_e64_dpp v5, v1, v2 row_mirror
-// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-v_add_nc_u16_e64_dpp v5, v1, v2 row_half_mirror
-// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:1
-// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
+// GFX12: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:15
-// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:1
-// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:15
-// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror
+// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror
+// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
-v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:1
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1
// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:15
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15
// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-v_add_nc_u16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-v_add_nc_u16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-v_add_nc_u16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1
+// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15
+// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1
+// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15
+// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
-v_add_nc_u16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: [0xff,0x80,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
+v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+
+v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
+
+v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
v_alignbit_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x16,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -4622,47 +4670,71 @@ v_sub_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:
v_sub_co_u32_e64_dpp v255, null, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
// GFX12: [0xff,0xfc,0x01,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
-v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
-// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
+// GFX12: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3]
-// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+v_sub_nc_i16_e64_dpp v255.l, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: [0xff,0x80,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_mirror
-// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_half_mirror
-// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:1
-// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
+// GFX12: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:15
-// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:1
-// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:15
-// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror
+// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:1
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror
+// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1
// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:15
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15
// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1
+// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15
+// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1
+// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15
+// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
-v_sub_nc_i16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: [0xff,0x80,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
+v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+
+v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
+
+v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
v_sub_nc_i32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x25,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -4706,47 +4778,71 @@ v_sub_nc_i32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr
v_sub_nc_i32_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
// GFX12: [0xff,0x80,0x25,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
-v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
-// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
+// GFX12: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3]
-// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+v_sub_nc_u16_e64_dpp v255.l, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: [0xff,0x80,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_mirror
-// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_half_mirror
-// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:1
-// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
+// GFX12: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:15
-// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:1
-// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:15
-// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror
+// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:1
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror
+// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1
// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:15
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15
// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1
+// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15
+// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1
+// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15
+// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
-v_sub_nc_u16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: [0xff,0x80,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
+v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+
+v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
+
+v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
v_subrev_co_u32_e64_dpp v5, s6, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x06,0x02,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -5001,30 +5097,6 @@ v_xor_b16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
v_xor_b16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
// GFX12: [0xff,0x00,0x64,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
-v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-
-v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-
-v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
-// GFX12: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-
-v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
-// GFX12: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-
-v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-
-v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-
-v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
-// GFX12: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-
-v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
-// GFX12: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-
v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
// GFX12: [0x05,0x0a,0x12,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
@@ -5250,30 +5322,6 @@ v_pack_b32_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] row_xmask:0 row_mask:0x1 ban
v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
// GFX12: [0xff,0x13,0x11,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
-v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-
-v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-
-v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
-// GFX12: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-
-v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
-// GFX12: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-
-v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-
-v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-
-v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
-// GFX12: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-
-v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
-// GFX12: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-
v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
// GFX12: [0x00,0x00,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
index 1be122f..6331d22 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
@@ -134,14 +134,38 @@ v_add_lshl_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_add_lshl_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0xff,0x00,0x47,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00]
-v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_add_nc_i16_e64_dpp v255.l, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: [0xff,0x80,0x0d,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+
+v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
+// GFX12: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x0d,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-v_add_nc_i16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: [0xff,0x80,0x0d,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x05,0x10,0x0d,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: [0xff,0xc0,0x0d,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
v_add_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x26,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -152,14 +176,38 @@ v_add_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_add_nc_i32_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0xff,0x80,0x26,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_add_nc_u16_e64_dpp v255.l, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: [0xff,0x80,0x03,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+
+v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
+// GFX12: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x03,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-v_add_nc_u16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: [0xff,0x80,0x03,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x05,0x10,0x03,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: [0xff,0xc0,0x03,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
v_alignbit_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x16,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -3043,14 +3091,38 @@ v_sub_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_sub_co_u32_e64_dpp v255, null, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0xff,0xfc,0x01,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_sub_nc_i16_e64_dpp v255.l, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: [0xff,0x80,0x0e,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+
+v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
+// GFX12: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x0e,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-v_sub_nc_i16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: [0xff,0x80,0x0e,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x05,0x10,0x0e,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: [0xff,0xc0,0x0e,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
v_sub_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x25,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -3061,14 +3133,38 @@ v_sub_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_sub_nc_i32_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0xff,0x80,0x25,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_sub_nc_u16_e64_dpp v255.l, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: [0xff,0x80,0x04,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+
+v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
+// GFX12: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x04,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-v_sub_nc_u16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: [0xff,0x80,0x04,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x05,0x10,0x04,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: [0xff,0xc0,0x04,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
v_subrev_co_u32_e64_dpp v5, s6, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x06,0x02,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -3210,30 +3306,6 @@ v_xor_b16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_xor_b16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0xff,0x00,0x64,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
-// GFX12: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-
-v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
-// GFX12: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-
v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x0a,0x12,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
@@ -3459,30 +3531,6 @@ v_pack_b32_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0]
v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1
// GFX12: [0xff,0x13,0x11,0xd7,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
-v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
-// GFX12: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-
-v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
-
-v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
-// GFX12: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
-
v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,4,4,4]
// GFX12: [0x00,0x00,0x66,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt
index 07058a6..365caa5 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt
@@ -189,49 +189,112 @@
# GFX11: v_add_lshl_u32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x47,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf]
0xff,0x00,0x47,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf
-# GFX11: v_add_nc_i16 v5, v1, v2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00]
-0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00
-
-# GFX11: v_add_nc_i16 v5, v255, v255 ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00]
-0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00
-
-# GFX11: v_add_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00
+
+# W32-REAL16: v_add_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
+0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00
+
+# W32-REAL16: v_add_nc_i16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00]
0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00
-# GFX11: v_add_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00]
0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00
-# GFX11: v_add_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00]
0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00
-# GFX11: v_add_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
-# GFX11: v_add_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00]
0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00
-# GFX11: v_add_nc_i16 v5, m0, 0x3800
+# W32-REAL16: v_add_nc_i16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
0x05,0x00,0x0d,0xd7,0x7d,0xe0,0x01,0x00
-# GFX11: v_add_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00]
0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00
-# GFX11: v_add_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00]
0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00
-# GFX11: v_add_nc_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00]
0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00
-# GFX11: v_add_nc_i16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0d,0xd7,0xc1,0xfe,0x00,0x00]
-0x05,0x58,0x0d,0xd7,0xc1,0xfe,0x00,0x00
-
-# GFX11: v_add_nc_i16 v5, 0x3800, m0 op_sel:[1,0,0]
-0x05,0x08,0x0d,0xd7,0xf0,0xfa,0x00,0x00
-
-# GFX11: v_add_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00]
-0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00
+# W32-REAL16: v_add_nc_i16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x0d,0xd7,0xc1,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x0d,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x0d,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x0d,0xd7,0xc1,0xfe,0x00,0x00]
+0x05,0x00,0x0d,0xd7,0xc1,0xfe,0x00,0x00
+
+# W32-REAL16: v_add_nc_i16 v5.l, 0x3800, m0 ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, 0x3800, m0 ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, 0x3800, m0 ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, 0x3800, m0 ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+0x05,0x00,0x0d,0xd7,0xf0,0xfa,0x00,0x00
+
+# W32-REAL16: v_add_nc_i16 v5.l, src_scc, vcc_lo ; encoding: [0x05,0x00,0x0d,0xd7,0xfd,0xd4,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x0d,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, src_scc, vcc_lo ; encoding: [0x05,0x00,0x0d,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x0d,0xd7,0xfd,0xd4,0x00,0x00]
+0x05,0x00,0x0d,0xd7,0xfd,0xd4,0x00,0x00
+
+# W32-REAL16: v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
-# GFX11: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00
+
+# W32-REAL16: v_add_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
+0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00
+
+# W32-REAL16: v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
# GFX11: v_add_nc_i32 v5, v1, v2 ; encoding: [0x05,0x00,0x26,0xd7,0x01,0x05,0x02,0x00]
@@ -279,49 +342,112 @@
# GFX11: v_add_nc_i32 v255, 0xaf123456, vcc_hi clamp ; encoding: [0xff,0x80,0x26,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
0xff,0x80,0x26,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
-# GFX11: v_add_nc_u16 v5, v1, v2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00]
-0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00
-
-# GFX11: v_add_nc_u16 v5, v255, v255 ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00]
-0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00
-
-# GFX11: v_add_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00
+
+# W32-REAL16: v_add_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00]
+0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00
+
+# W32-REAL16: v_add_nc_u16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00]
0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00
-# GFX11: v_add_nc_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00]
0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00
-# GFX11: v_add_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00]
0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00
-# GFX11: v_add_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
-# GFX11: v_add_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00]
0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00
-# GFX11: v_add_nc_u16 v5, m0, 0x3800
+# W32-REAL16: v_add_nc_u16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
0x05,0x00,0x03,0xd7,0x7d,0xe0,0x01,0x00
-# GFX11: v_add_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00]
0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00
-# GFX11: v_add_nc_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00]
0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00
-# GFX11: v_add_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00]
0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00
-# GFX11: v_add_nc_u16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x03,0xd7,0xc1,0xfe,0x00,0x00]
-0x05,0x58,0x03,0xd7,0xc1,0xfe,0x00,0x00
-
-# GFX11: v_add_nc_u16 v5, 0x3800, m0 op_sel:[1,0,0]
-0x05,0x08,0x03,0xd7,0xf0,0xfa,0x00,0x00
-
-# GFX11: v_add_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00]
-0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00
+# W32-REAL16: v_add_nc_u16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x03,0xd7,0xc1,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x03,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x03,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x03,0xd7,0xc1,0xfe,0x00,0x00]
+0x05,0x00,0x03,0xd7,0xc1,0xfe,0x00,0x00
+
+# W32-REAL16: v_add_nc_u16 v5.l, 0x3800, m0 ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, 0x3800, m0 ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, 0x3800, m0 ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, 0x3800, m0 ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+0x05,0x00,0x03,0xd7,0xf0,0xfa,0x00,0x00
+
+# W32-REAL16: v_add_nc_u16 v5.l, src_scc, vcc_lo ; encoding: [0x05,0x00,0x03,0xd7,0xfd,0xd4,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x03,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, src_scc, vcc_lo ; encoding: [0x05,0x00,0x03,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x03,0xd7,0xfd,0xd4,0x00,0x00]
+0x05,0x00,0x03,0xd7,0xfd,0xd4,0x00,0x00
+
+# W32-REAL16: v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
-# GFX11: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00
+
+# W32-REAL16: v_add_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00]
+0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00
+
+# W32-REAL16: v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
# GFX11: v_alignbit_b32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x16,0xd6,0x01,0x05,0x0e,0x00]
@@ -5871,49 +5997,112 @@
# GFX11: v_sub_co_u32 v255, null, 0xaf123456, vcc_hi clamp ; encoding: [0xff,0xfc,0x01,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
0xff,0xfc,0x01,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
-# GFX11: v_sub_nc_i16 v5, v1, v2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00]
-0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00
-
-# GFX11: v_sub_nc_i16 v5, v255, v255 ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00]
-0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00
-
-# GFX11: v_sub_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00
+
+# W32-REAL16: v_sub_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00]
+0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00
+
+# W32-REAL16: v_sub_nc_i16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00]
0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00
-# GFX11: v_sub_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00]
0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00
-# GFX11: v_sub_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00]
0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00
-# GFX11: v_sub_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
-# GFX11: v_sub_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00]
0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00
-# GFX11: v_sub_nc_i16 v5, m0, 0x3800
+# W32-REAL16: v_sub_nc_i16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
0x05,0x00,0x0e,0xd7,0x7d,0xe0,0x01,0x00
-# GFX11: v_sub_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00]
0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00
-# GFX11: v_sub_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00]
0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00
-# GFX11: v_sub_nc_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00]
0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00
-# GFX11: v_sub_nc_i16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0e,0xd7,0xc1,0xfe,0x00,0x00]
-0x05,0x58,0x0e,0xd7,0xc1,0xfe,0x00,0x00
-
-# GFX11: v_sub_nc_i16 v5, 0x3800, m0 op_sel:[1,0,0]
-0x05,0x08,0x0e,0xd7,0xf0,0xfa,0x00,0x00
-
-# GFX11: v_sub_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00]
-0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00
+# W32-REAL16: v_sub_nc_i16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x0e,0xd7,0xc1,0xfe,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x0e,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x0e,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x0e,0xd7,0xc1,0xfe,0x00,0x00]
+0x05,0x00,0x0e,0xd7,0xc1,0xfe,0x00,0x00
+
+# W32-REAL16: v_sub_nc_i16 v5.l, 0x3800, m0 ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, 0x3800, m0 ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, 0x3800, m0 ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, 0x3800, m0 ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+0x05,0x00,0x0e,0xd7,0xf0,0xfa,0x00,0x00
+
+# W32-REAL16: v_sub_nc_i16 v5.l, src_scc, vcc_lo ; encoding: [0x05,0x00,0x0e,0xd7,0xfd,0xd4,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x0e,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, src_scc, vcc_lo ; encoding: [0x05,0x00,0x0e,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x0e,0xd7,0xfd,0xd4,0x00,0x00]
+0x05,0x00,0x0e,0xd7,0xfd,0xd4,0x00,0x00
+
+# W32-REAL16: v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
-# GFX11: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00
+
+# W32-REAL16: v_sub_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00]
+0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00
+
+# W32-REAL16: v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
# GFX11: v_sub_nc_i32 v5, v1, v2 ; encoding: [0x05,0x00,0x25,0xd7,0x01,0x05,0x02,0x00]
@@ -5961,49 +6150,112 @@
# GFX11: v_sub_nc_i32 v255, 0xaf123456, vcc_hi clamp ; encoding: [0xff,0x80,0x25,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
0xff,0x80,0x25,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
-# GFX11: v_sub_nc_u16 v5, v1, v2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00]
-0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00
-
-# GFX11: v_sub_nc_u16 v5, v255, v255 ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00]
-0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00
-
-# GFX11: v_sub_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00
+
+# W32-REAL16: v_sub_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00]
+0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00
+
+# W32-REAL16: v_sub_nc_u16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00]
0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00
-# GFX11: v_sub_nc_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00]
0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00
-# GFX11: v_sub_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00]
0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00
-# GFX11: v_sub_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00

-# GFX11: v_sub_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00]
0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00

-# GFX11: v_sub_nc_u16 v5, m0, 0x3800
+# W32-REAL16: v_sub_nc_u16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
0x05,0x00,0x04,0xd7,0x7d,0xe0,0x01,0x00

-# GFX11: v_sub_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00]
0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00

-# GFX11: v_sub_nc_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00]
0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00

-# GFX11: v_sub_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00]
0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00

-# GFX11: v_sub_nc_u16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x04,0xd7,0xc1,0xfe,0x00,0x00]
-0x05,0x58,0x04,0xd7,0xc1,0xfe,0x00,0x00
-
-# GFX11: v_sub_nc_u16 v5, 0x3800, m0 op_sel:[1,0,0]
-0x05,0x08,0x04,0xd7,0xf0,0xfa,0x00,0x00
-
-# GFX11: v_sub_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00]
-0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00
+# W32-REAL16: v_sub_nc_u16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x04,0xd7,0xc1,0xfe,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x04,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x04,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x04,0xd7,0xc1,0xfe,0x00,0x00]
+0x05,0x00,0x04,0xd7,0xc1,0xfe,0x00,0x00
+
+# W32-REAL16: v_sub_nc_u16 v5.l, 0x3800, m0 ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, 0x3800, m0 ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, 0x3800, m0 ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, 0x3800, m0 ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+0x05,0x00,0x04,0xd7,0xf0,0xfa,0x00,0x00
+
+# W32-REAL16: v_sub_nc_u16 v5.l, src_scc, vcc_lo ; encoding: [0x05,0x00,0x04,0xd7,0xfd,0xd4,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x04,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, src_scc, vcc_lo ; encoding: [0x05,0x00,0x04,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x04,0xd7,0xfd,0xd4,0x00,0x00]
+0x05,0x00,0x04,0xd7,0xfd,0xd4,0x00,0x00
+
+# W32-REAL16: v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00

-# GFX11: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00
+
+# W32-REAL16: v_sub_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00]
+0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00
+
+# W32-REAL16: v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00

# W32: v_subrev_co_u32 v5, s12, v1, v2 ; encoding: [0x05,0x0c,0x02,0xd7,0x01,0x05,0x02,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt
index 4ae8b05..d0bd639 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt
@@ -3824,88 +3824,220 @@
# W64-FAKE16: v_xor_b16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x64,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0xff,0x00,0x64,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30

-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff

-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff

-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff

-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff

-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff

-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff

-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff

-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff

-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff

-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff

-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff

-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01

-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13

-# GFX11: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30

-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff

-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff

-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff

-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff

-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff

-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff

-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff

-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff

-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff

-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff

-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff

-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01

-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13

-# GFX11: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30

# GFX11: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -4664,88 +4796,220 @@
# GFX11: v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x13,0x11,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0xff,0x13,0x11,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30

-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff

-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff

-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff

-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff

-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff

-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff

-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff

-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff

-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff

-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff

-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff

-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01

-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13

-# GFX11: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30

-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff

-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
-# GFX11: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
# GFX11: v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x00,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt
index b44dba7..cbf5a3d 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt
@@ -2168,34 +2168,112 @@
# W64-FAKE16: v_xor_b16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x64,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x00,0x64,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX11: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -2840,34 +2918,112 @@
# GFX11: v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x13,0x11,0xd7,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
0xff,0x13,0x11,0xd7,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX11: v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x00,0x66,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt
index af04a31..c87c885 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt
@@ -153,49 +153,112 @@
# GFX12: v_add_lshl_u32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x47,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf]
0xff,0x00,0x47,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf
-# GFX12: v_add_nc_i16 v5, v1, v2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, v1, v2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, v1, v2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00]
0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00
-# GFX12: v_add_nc_i16 v5, v255, v255 ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, v255, v255 ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, v255, v255 ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00]
0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00
-# GFX12: v_add_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00]
0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00
-# GFX12: v_add_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00]
0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00
-# GFX12: v_add_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00]
0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00
-# GFX12: v_add_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
-# GFX12: v_add_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00]
0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00
-# GFX12: v_add_nc_i16 v5, m0, 0x3800
+# W32-REAL16: v_add_nc_i16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
0x05,0x00,0x0d,0xd7,0x7d,0xe0,0x01,0x00
-# GFX12: v_add_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00]
0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00
-# GFX12: v_add_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00]
0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00
-# GFX12: v_add_nc_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00]
0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00
-# GFX12: v_add_nc_i16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0d,0xd7,0xc1,0xfe,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.h, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0d,0xd7,0xc1,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0d,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.h, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0d,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0d,0xd7,0xc1,0xfe,0x00,0x00]
0x05,0x58,0x0d,0xd7,0xc1,0xfe,0x00,0x00
-# GFX12: v_add_nc_i16 v5, 0x3800, m0 op_sel:[1,0,0]
+# W32-REAL16: v_add_nc_i16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
0x05,0x08,0x0d,0xd7,0xf0,0xfa,0x00,0x00
-# GFX12: v_add_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00]
0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00
-# GFX12: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+
+# W32-REAL16: v_add_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00
+
+# W32-REAL16: v_add_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
+0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00
+
+# W32-REAL16: v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
# GFX12: v_add_nc_i32 v5, v1, v2 ; encoding: [0x05,0x00,0x26,0xd7,0x01,0x05,0x02,0x00]
@@ -243,49 +306,112 @@
# GFX12: v_add_nc_i32 v255, 0xaf123456, vcc_hi clamp ; encoding: [0xff,0x80,0x26,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
0xff,0x80,0x26,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
-# GFX12: v_add_nc_u16 v5, v1, v2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, v1, v2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, v1, v2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00]
0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00
-# GFX12: v_add_nc_u16 v5, v255, v255 ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, v255, v255 ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, v255, v255 ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00]
0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00
-# GFX12: v_add_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00]
0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00
-# GFX12: v_add_nc_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00]
0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00
-# GFX12: v_add_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00]
0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00
-# GFX12: v_add_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
-# GFX12: v_add_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00]
0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00
-# GFX12: v_add_nc_u16 v5, m0, 0x3800
+# W32-REAL16: v_add_nc_u16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
0x05,0x00,0x03,0xd7,0x7d,0xe0,0x01,0x00
-# GFX12: v_add_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00]
0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00
-# GFX12: v_add_nc_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00]
0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00
-# GFX12: v_add_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00]
0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00
-# GFX12: v_add_nc_u16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x03,0xd7,0xc1,0xfe,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.h, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x03,0xd7,0xc1,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x03,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.h, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x03,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x03,0xd7,0xc1,0xfe,0x00,0x00]
0x05,0x58,0x03,0xd7,0xc1,0xfe,0x00,0x00
-# GFX12: v_add_nc_u16 v5, 0x3800, m0 op_sel:[1,0,0]
+# W32-REAL16: v_add_nc_u16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
0x05,0x08,0x03,0xd7,0xf0,0xfa,0x00,0x00
-# GFX12: v_add_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00]
0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00
-# GFX12: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+
+# W32-REAL16: v_add_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00
+
+# W32-REAL16: v_add_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00]
+0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00
+
+# W32-REAL16: v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
# GFX12: v_alignbit_b32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x16,0xd6,0x01,0x05,0x0e,0x00]
@@ -5797,49 +5923,112 @@
# GFX12: v_sub_co_u32 v255, null, 0xaf123456, vcc_hi clamp ; encoding: [0xff,0xfc,0x01,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
0xff,0xfc,0x01,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
-# GFX12: v_sub_nc_i16 v5, v1, v2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, v1, v2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, v1, v2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00]
0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00
-# GFX12: v_sub_nc_i16 v5, v255, v255 ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, v255, v255 ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, v255, v255 ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00]
0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00
-# GFX12: v_sub_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00]
0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00
-# GFX12: v_sub_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00]
0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00
-# GFX12: v_sub_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00]
0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00
-# GFX12: v_sub_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
-# GFX12: v_sub_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00]
0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00
-# GFX12: v_sub_nc_i16 v5, m0, 0x3800
+# W32-REAL16: v_sub_nc_i16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
0x05,0x00,0x0e,0xd7,0x7d,0xe0,0x01,0x00
-# GFX12: v_sub_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00]
0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00
-# GFX12: v_sub_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00]
0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00
-# GFX12: v_sub_nc_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00]
0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00
-# GFX12: v_sub_nc_i16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0e,0xd7,0xc1,0xfe,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.h, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0e,0xd7,0xc1,0xfe,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0e,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.h, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0e,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0e,0xd7,0xc1,0xfe,0x00,0x00]
0x05,0x58,0x0e,0xd7,0xc1,0xfe,0x00,0x00
-# GFX12: v_sub_nc_i16 v5, 0x3800, m0 op_sel:[1,0,0]
+# W32-REAL16: v_sub_nc_i16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
0x05,0x08,0x0e,0xd7,0xf0,0xfa,0x00,0x00
-# GFX12: v_sub_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00]
0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00
-# GFX12: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+
+# W32-REAL16: v_sub_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00
+
+# W32-REAL16: v_sub_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_sub_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_sub_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00]
+# W64-FAKE16: v_sub_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00]
+0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00
+
+# W32-REAL16: v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
# GFX12: v_sub_nc_i32 v5, v1, v2 ; encoding: [0x05,0x00,0x25,0xd7,0x01,0x05,0x02,0x00]
@@ -5887,49 +6076,112 @@
# GFX12: v_sub_nc_i32 v255, 0xaf123456, vcc_hi clamp ; encoding: [0xff,0x80,0x25,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
0xff,0x80,0x25,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf
-# GFX12: v_sub_nc_u16 v5, v1, v2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, v1, v2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, v1, v2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00]
0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00
-# GFX12: v_sub_nc_u16 v5, v255, v255 ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, v255, v255 ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, v255, v255 ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00]
0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00
-# GFX12: v_sub_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00]
0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00
-# GFX12: v_sub_nc_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00]
0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00
-# GFX12: v_sub_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00]
0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00
-# GFX12: v_sub_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
-# GFX12: v_sub_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00]
0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00
-# GFX12: v_sub_nc_u16 v5, m0, 0x3800
+# W32-REAL16: v_sub_nc_u16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
0x05,0x00,0x04,0xd7,0x7d,0xe0,0x01,0x00
-# GFX12: v_sub_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00]
0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00
-# GFX12: v_sub_nc_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00]
0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00
-# GFX12: v_sub_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00]
0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00
-# GFX12: v_sub_nc_u16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x04,0xd7,0xc1,0xfe,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.h, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x04,0xd7,0xc1,0xfe,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x04,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.h, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x04,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x04,0xd7,0xc1,0xfe,0x00,0x00]
0x05,0x58,0x04,0xd7,0xc1,0xfe,0x00,0x00
-# GFX12: v_sub_nc_u16 v5, 0x3800, m0 op_sel:[1,0,0]
+# W32-REAL16: v_sub_nc_u16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
0x05,0x08,0x04,0xd7,0xf0,0xfa,0x00,0x00
-# GFX12: v_sub_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00]
0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00
-# GFX12: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+
+# W32-REAL16: v_sub_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00
+
+# W32-REAL16: v_sub_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_sub_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_sub_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00]
+# W64-FAKE16: v_sub_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00]
+0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00
+
+# W32-REAL16: v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
# W32: v_subrev_co_u32 v5, s12, v1, v2 ; encoding: [0x05,0x0c,0x02,0xd7,0x01,0x05,0x02,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
index 65cfdd5..5081b98 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
@@ -4115,88 +4115,268 @@
# W64-FAKE16: v_xor_b16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x64,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0xff,0x00,0x64,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
-# GFX12: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
-# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
-# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
-# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
-# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
-# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
-# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
-# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
-# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
-# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
-# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
-# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
-# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
-# GFX12: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
# GFX12: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -5000,88 +5180,268 @@
# GFX12: v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x13,0x11,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
0xff,0x13,0x11,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
-# GFX12: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
-# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
-# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
-# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
-# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
-# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
-# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
-# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
-# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
-# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
-# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
-# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
-# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
-# GFX12: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
# GFX12: v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x00,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt
index 4640b96..77f0502 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt
@@ -2393,34 +2393,160 @@
# W64-FAKE16: v_xor_b16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x64,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x00,0x64,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
-# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -3113,34 +3239,160 @@
# GFX12: v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x13,0x11,0xd7,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
0xff,0x13,0x11,0xd7,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
-# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x00,0x66,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92]
diff --git a/llvm/test/MC/RISCV/rv32c-valid.s b/llvm/test/MC/RISCV/rv32c-valid.s
index bcdf27a..9b0ca80 100644
--- a/llvm/test/MC/RISCV/rv32c-valid.s
+++ b/llvm/test/MC/RISCV/rv32c-valid.s
@@ -147,8 +147,7 @@ c.sub a4, a5
# CHECK-ASM: encoding: [0x01,0x00]
# CHECK-NO-EXT: error: instruction requires the following: 'C' (Compressed Instructions) or 'Zca' (part of the C extension, excluding compressed floating point loads/stores){{$}}
c.nop
-# CHECK-ASM: c.addi zero, 0
-# CHECK-OBJ: c.nop
+# CHECK-ASM-AND-OBJ: c.nop
# CHECK-ASM: encoding: [0x01,0x00]
# CHECK-NO-EXT: error: instruction requires the following: 'C' (Compressed Instructions) or 'Zca' (part of the C extension, excluding compressed floating point loads/stores){{$}}
c.addi x0, 0
diff --git a/llvm/test/Transforms/InstCombine/and-or-icmp-min-max.ll b/llvm/test/Transforms/InstCombine/and-or-icmp-min-max.ll
index 058847a..cc55c4a 100644
--- a/llvm/test/Transforms/InstCombine/and-or-icmp-min-max.ll
+++ b/llvm/test/Transforms/InstCombine/and-or-icmp-min-max.ll
@@ -689,6 +689,17 @@ define i1 @sge_and_max_logical(i8 %x, i8 %y) {
ret i1 %r
}
+define i1 @sge_and_max_logical_samesign(i8 %x, i8 %y) {
+; CHECK-LABEL: @sge_and_max_logical_samesign(
+; CHECK-NEXT: [[CMPEQ:%.*]] = icmp eq i8 [[X:%.*]], 127
+; CHECK-NEXT: ret i1 [[CMPEQ]]
+;
+ %cmp = icmp sge i8 %x, %y
+ %cmpeq = icmp samesign eq i8 %x, 127
+ %r = select i1 %cmp, i1 %cmpeq, i1 false
+ ret i1 %r
+}
+
define i1 @sge_and_max_commute(i8 %x, i8 %y) {
; CHECK-LABEL: @sge_and_max_commute(
; CHECK-NEXT: [[CMPEQ:%.*]] = icmp eq i8 [[X:%.*]], 127
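The samesign keyword exercised in the new test is an icmp flag asserting that both operands have the same sign bit; when it holds, an unsigned predicate can be reasoned about as its signed counterpart and vice versa, and like nuw/nsw it must be dropped whenever a transform stops justifying it. A minimal syntax sketch (not part of the patch), assuming an LLVM new enough to parse the flag:

define i1 @samesign_sketch(i8 %x, i8 %y) {
  ; With samesign, this ult may be treated as slt; if %x and %y
  ; actually differ in sign at runtime, the result is poison.
  %c = icmp samesign ult i8 %x, %y
  ret i1 %c
}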
diff --git a/llvm/test/Transforms/InstCombine/and-or-icmp-nullptr.ll b/llvm/test/Transforms/InstCombine/and-or-icmp-nullptr.ll
index d533cc70..8650b89 100644
--- a/llvm/test/Transforms/InstCombine/and-or-icmp-nullptr.ll
+++ b/llvm/test/Transforms/InstCombine/and-or-icmp-nullptr.ll
@@ -592,6 +592,19 @@ define i1 @sgt_and_min_logical(ptr %x, ptr %y) {
ret i1 %r
}
+define i1 @sgt_and_min_logical_samesign(ptr %x, ptr %y) {
+; CHECK-LABEL: @sgt_and_min_logical_samesign(
+; CHECK-NEXT: [[CMPEQ:%.*]] = icmp eq ptr [[X:%.*]], null
+; CHECK-NEXT: [[TMP1:%.*]] = icmp slt ptr [[Y:%.*]], null
+; CHECK-NEXT: [[R:%.*]] = and i1 [[CMPEQ]], [[TMP1]]
+; CHECK-NEXT: ret i1 [[R]]
+;
+ %cmp = icmp sgt ptr %x, %y
+ %cmpeq = icmp samesign eq ptr %x, null
+ %r = select i1 %cmp, i1 %cmpeq, i1 false
+ ret i1 %r
+}
+
define i1 @sle_or_not_min(ptr %x, ptr %y) {
; CHECK-LABEL: @sle_or_not_min(
; CHECK-NEXT: [[CMPEQ:%.*]] = icmp ne ptr [[X:%.*]], null
diff --git a/llvm/test/Transforms/InstCombine/icmp-and-shift.ll b/llvm/test/Transforms/InstCombine/icmp-and-shift.ll
index 684ece2..d092363 100644
--- a/llvm/test/Transforms/InstCombine/icmp-and-shift.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-and-shift.ll
@@ -619,6 +619,19 @@ define i1 @test_shr_and_1_ne_0(i32 %a, i32 %b) {
ret i1 %cmp
}
+define i1 @test_shr_and_1_ne_0_samesign(i32 %a, i32 %b) {
+; CHECK-LABEL: @test_shr_and_1_ne_0_samesign(
+; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i32 1, [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[A:%.*]], [[TMP1]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr i32 %a, %b
+ %and = and i32 %shr, 1
+ %cmp = icmp samesign ne i32 %and, 0
+ ret i1 %cmp
+}
+
define i1 @test_const_shr_and_1_ne_0(i32 %b) {
; CHECK-LABEL: @test_const_shr_and_1_ne_0(
; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i32 1, [[B:%.*]]
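The fold exercised by the new test rewrites a bit test from a shift of the value into a shift of the mask: ((a >> b) & 1) != 0 becomes (a & (1 << b)) != 0, which is what the CHECK lines above spell as shl nuw plus and. Note the rewritten compare no longer carries samesign: its operands are not the original pair, so the flag cannot be justified. A hand-written sketch of the target form, assuming scalar i32 operands:

define i1 @bit_test_sketch(i32 %a, i32 %b) {
  ; Both forms test bit %b of %a; shifting the constant 1 left never
  ; loses the set bit for an in-range %b, hence the nuw.
  %mask = shl nuw i32 1, %b
  %bit = and i32 %a, %mask
  %cmp = icmp ne i32 %bit, 0
  ret i1 %cmp
}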
diff --git a/llvm/test/Transforms/InstCombine/icmp-equality-test.ll b/llvm/test/Transforms/InstCombine/icmp-equality-test.ll
index c2740ca..b9d8f2d 100644
--- a/llvm/test/Transforms/InstCombine/icmp-equality-test.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-equality-test.ll
@@ -33,6 +33,22 @@ entry:
ret i1 %equal
}
+define i1 @icmp_equality_test_constant_samesign(i42 %X, i42 %Y) {
+; CHECK-LABEL: @icmp_equality_test_constant_samesign(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[XEQY:%.*]] = icmp eq i42 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: ret i1 [[XEQY]]
+;
+entry:
+ %XeqC = icmp eq i42 %X, -42
+ %YeqC = icmp eq i42 %Y, -42
+ %XeqY = icmp samesign eq i42 %X, %Y
+ %not.YeqC = xor i1 %YeqC, true
+ %and = select i1 %not.YeqC, i1 %XeqY, i1 false
+ %equal = select i1 %XeqC, i1 %YeqC, i1 %and
+ ret i1 %equal
+}
+
define i1 @icmp_equality_test_swift_optional_pointers(i64 %X, i64 %Y) {
; CHECK-LABEL: @icmp_equality_test_swift_optional_pointers(
; CHECK-NEXT: entry:
diff --git a/llvm/test/Transforms/InstCombine/icmp.ll b/llvm/test/Transforms/InstCombine/icmp.ll
index 5e80134..7cafb48 100644
--- a/llvm/test/Transforms/InstCombine/icmp.ll
+++ b/llvm/test/Transforms/InstCombine/icmp.ll
@@ -3203,6 +3203,21 @@ define i1 @icmp_and_or_lshr(i32 %x, i32 %y) {
ret i1 %ret
}
+define i1 @icmp_and_or_lshr_samesign(i32 %x, i32 %y) {
+; CHECK-LABEL: @icmp_and_or_lshr_samesign(
+; CHECK-NEXT: [[SHF1:%.*]] = shl nuw i32 1, [[Y:%.*]]
+; CHECK-NEXT: [[OR2:%.*]] = or i32 [[SHF1]], 1
+; CHECK-NEXT: [[AND3:%.*]] = and i32 [[X:%.*]], [[OR2]]
+; CHECK-NEXT: [[RET:%.*]] = icmp ne i32 [[AND3]], 0
+; CHECK-NEXT: ret i1 [[RET]]
+;
+ %shf = lshr i32 %x, %y
+ %or = or i32 %shf, %x
+ %and = and i32 %or, 1
+ %ret = icmp samesign ne i32 %and, 0
+ ret i1 %ret
+}
+
define <2 x i1> @icmp_and_or_lshr_vec(<2 x i32> %x, <2 x i32> %y) {
; CHECK-LABEL: @icmp_and_or_lshr_vec(
; CHECK-NEXT: [[SHF:%.*]] = lshr <2 x i32> [[X:%.*]], [[Y:%.*]]
diff --git a/llvm/test/Transforms/InstCombine/select-cmp.ll b/llvm/test/Transforms/InstCombine/select-cmp.ll
index 2348159..f7505bd 100644
--- a/llvm/test/Transforms/InstCombine/select-cmp.ll
+++ b/llvm/test/Transforms/InstCombine/select-cmp.ll
@@ -480,6 +480,95 @@ define i1 @test_select_inverse_nonconst4(i64 %x, i64 %y, i64 %z, i1 %cond) {
ret i1 %sel
}
+define i1 @test_select_inverse_samesign_true_arm(i64 %x, i64 %y, i1 %cond) {
+; CHECK-LABEL: @test_select_inverse_samesign_true_arm(
+; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ult i64 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[CMP2:%.*]] = icmp uge i64 [[X]], [[Y]]
+; CHECK-NEXT: [[SEL:%.*]] = select i1 [[COND:%.*]], i1 [[CMP1]], i1 [[CMP2]]
+; CHECK-NEXT: ret i1 [[SEL]]
+;
+ %cmp1 = icmp samesign ult i64 %x, %y
+ %cmp2 = icmp uge i64 %x, %y
+ %sel = select i1 %cond, i1 %cmp1, i1 %cmp2
+ ret i1 %sel
+}
+
+define i1 @test_select_inverse_samesign_false_arm(i64 %x, i64 %y, i1 %cond) {
+; CHECK-LABEL: @test_select_inverse_samesign_false_arm(
+; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i64 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[CMP2:%.*]] = icmp samesign uge i64 [[X]], [[Y]]
+; CHECK-NEXT: [[SEL:%.*]] = select i1 [[COND:%.*]], i1 [[CMP1]], i1 [[CMP2]]
+; CHECK-NEXT: ret i1 [[SEL]]
+;
+ %cmp1 = icmp ult i64 %x, %y
+ %cmp2 = icmp samesign uge i64 %x, %y
+ %sel = select i1 %cond, i1 %cmp1, i1 %cmp2
+ ret i1 %sel
+}
+
+define i1 @test_select_inverse_samesign_both(i64 %x, i64 %y, i1 %cond) {
+; CHECK-LABEL: @test_select_inverse_samesign_both(
+; CHECK-NEXT: [[CMP2:%.*]] = icmp samesign uge i64 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[SEL:%.*]] = xor i1 [[COND:%.*]], [[CMP2]]
+; CHECK-NEXT: ret i1 [[SEL]]
+;
+ %cmp1 = icmp samesign ult i64 %x, %y
+ %cmp2 = icmp samesign uge i64 %x, %y
+ %sel = select i1 %cond, i1 %cmp1, i1 %cmp2
+ ret i1 %sel
+}
+
+define i1 @test_select_inverse_samesign_false_arm_rhsc_same_sign(i64 %x, i64 %y, i1 %cond) {
+; CHECK-LABEL: @test_select_inverse_samesign_false_arm_rhsc_same_sign(
+; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i64 [[X:%.*]], 11
+; CHECK-NEXT: [[CMP2:%.*]] = icmp samesign ugt i64 [[X]], 10
+; CHECK-NEXT: [[SEL:%.*]] = select i1 [[COND:%.*]], i1 [[CMP1]], i1 [[CMP2]]
+; CHECK-NEXT: ret i1 [[SEL]]
+;
+ %cmp1 = icmp ult i64 %x, 11
+ %cmp2 = icmp samesign ugt i64 %x, 10
+ %sel = select i1 %cond, i1 %cmp1, i1 %cmp2
+ ret i1 %sel
+}
+
+define i1 @test_select_inverse_samesign_true_arm_rhsc_same_sign(i64 %x, i64 %y, i1 %cond) {
+; CHECK-LABEL: @test_select_inverse_samesign_true_arm_rhsc_same_sign(
+; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ult i64 [[X:%.*]], 11
+; CHECK-NEXT: [[CMP2:%.*]] = icmp ugt i64 [[X]], 10
+; CHECK-NEXT: [[SEL:%.*]] = select i1 [[COND:%.*]], i1 [[CMP1]], i1 [[CMP2]]
+; CHECK-NEXT: ret i1 [[SEL]]
+;
+ %cmp1 = icmp samesign ult i64 %x, 11
+ %cmp2 = icmp ugt i64 %x, 10
+ %sel = select i1 %cond, i1 %cmp1, i1 %cmp2
+ ret i1 %sel
+}
+
+define i1 @test_select_inverse_samesign_both_rhsc_same_sign(i64 %x, i64 %y, i1 %cond) {
+; CHECK-LABEL: @test_select_inverse_samesign_both_rhsc_same_sign(
+; CHECK-NEXT: [[CMP2:%.*]] = icmp samesign ugt i64 [[X:%.*]], 10
+; CHECK-NEXT: [[SEL:%.*]] = xor i1 [[COND:%.*]], [[CMP2]]
+; CHECK-NEXT: ret i1 [[SEL]]
+;
+ %cmp1 = icmp samesign ult i64 %x, 11
+ %cmp2 = icmp samesign ugt i64 %x, 10
+ %sel = select i1 %cond, i1 %cmp1, i1 %cmp2
+ ret i1 %sel
+}
+
+define i1 @test_select_inverse_samesign_both_rhsc_diff_sign(i64 %x, i64 %y, i1 %cond) {
+; CHECK-LABEL: @test_select_inverse_samesign_both_rhsc_diff_sign(
+; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign slt i64 [[X:%.*]], 0
+; CHECK-NEXT: [[CMP2:%.*]] = icmp samesign sgt i64 [[X]], -1
+; CHECK-NEXT: [[SEL:%.*]] = select i1 [[COND:%.*]], i1 [[CMP1]], i1 [[CMP2]]
+; CHECK-NEXT: ret i1 [[SEL]]
+;
+ %cmp1 = icmp samesign slt i64 %x, 0
+ %cmp2 = icmp samesign sgt i64 %x, -1
+ %sel = select i1 %cond, i1 %cmp1, i1 %cmp2
+ ret i1 %sel
+}
+
define i1 @sel_icmp_two_cmp(i1 %c, i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
; CHECK-LABEL: @sel_icmp_two_cmp(
; CHECK-NEXT: [[CMP1:%.*]] = icmp ule i32 [[A1:%.*]], [[A2:%.*]]
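The new select-cmp tests pin down when a select between two compares with inverse predicates may fold to an xor: select(%cond, %cmp1, %cmp2) with %cmp1 the negation of %cmp2 is just %cond ^ %cmp2, but the fold must reconcile samesign across both compares, which is why several of the mixed and constant-RHS cases above deliberately stay un-folded. A sketch of the identity on its own, assuming the two compares are genuine negations:

define i1 @inverse_select_sketch(i64 %x, i64 %y, i1 %cond) {
  ; cmp1 = (x ult y) and cmp2 = (x uge y) negate each other, so
  ; select(cond, cmp1, cmp2) picks between !cmp2 and cmp2: an xor.
  %cmp2 = icmp uge i64 %x, %y
  %sel = xor i1 %cond, %cmp2
  ret i1 %sel
}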
diff --git a/llvm/test/Transforms/InstCombine/select-icmp-xor.ll b/llvm/test/Transforms/InstCombine/select-icmp-xor.ll
new file mode 100644
index 0000000..c8ce114
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/select-icmp-xor.ll
@@ -0,0 +1,190 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=instcombine -S %s | FileCheck %s
+
+define i8 @select_icmp_eq_pow2(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_eq_pow2(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT: [[SEL:%.*]] = and i8 [[X]], -5
+; CHECK-NEXT: ret i8 [[SEL]]
+;
+ %and = and i8 %x, 4
+ %icmp = icmp eq i8 %and, 0
+ %xor = xor i8 %x, 4
+ %sel = select i1 %icmp, i8 %x, i8 %xor
+ ret i8 %sel
+}
+
+define i8 @select_icmp_eq_pow2_flipped(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_eq_pow2_flipped(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT: [[SEL:%.*]] = or i8 [[X]], 4
+; CHECK-NEXT: ret i8 [[SEL]]
+;
+ %and = and i8 %x, 4
+ %icmp = icmp eq i8 %and, 0
+ %xor = xor i8 %x, 4
+ %sel = select i1 %icmp, i8 %xor, i8 %x
+ ret i8 %sel
+}
+
+define i8 @select_icmp_eq_not_pow2(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_eq_not_pow2(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT: [[AND:%.*]] = and i8 [[X]], 5
+; CHECK-NEXT: [[ICMP:%.*]] = icmp eq i8 [[AND]], 0
+; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[X]], 5
+; CHECK-NEXT: [[SEL:%.*]] = select i1 [[ICMP]], i8 [[X]], i8 [[XOR]]
+; CHECK-NEXT: ret i8 [[SEL]]
+;
+ %and = and i8 %x, 5
+ %icmp = icmp eq i8 %and, 0
+ %xor = xor i8 %x, 5
+ %sel = select i1 %icmp, i8 %x, i8 %xor
+ ret i8 %sel
+}
+
+define i8 @select_icmp_ne_pow2(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_ne_pow2(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT: [[SEL:%.*]] = and i8 [[X]], -5
+; CHECK-NEXT: ret i8 [[SEL]]
+;
+ %and = and i8 %x, 4
+ %icmp = icmp ne i8 %and, 0
+ %xor = xor i8 %x, 4
+ %sel = select i1 %icmp, i8 %xor, i8 %x
+ ret i8 %sel
+}
+
+define i8 @select_icmp_ne_pow2_flipped(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_ne_pow2_flipped(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT: [[SEL:%.*]] = or i8 [[X]], 4
+; CHECK-NEXT: ret i8 [[SEL]]
+;
+ %and = and i8 %x, 4
+ %icmp = icmp ne i8 %and, 0
+ %xor = xor i8 %x, 4
+ %sel = select i1 %icmp, i8 %x, i8 %xor
+ ret i8 %sel
+}
+
+define i8 @select_icmp_ne_not_pow2(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_ne_not_pow2(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT: [[AND:%.*]] = and i8 [[X]], 5
+; CHECK-NEXT: [[ICMP_NOT:%.*]] = icmp eq i8 [[AND]], 0
+; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[X]], 5
+; CHECK-NEXT: [[SEL:%.*]] = select i1 [[ICMP_NOT]], i8 [[X]], i8 [[XOR]]
+; CHECK-NEXT: ret i8 [[SEL]]
+;
+ %and = and i8 %x, 5
+ %icmp = icmp ne i8 %and, 0
+ %xor = xor i8 %x, 5
+ %sel = select i1 %icmp, i8 %xor, i8 %x
+ ret i8 %sel
+}
+
+define i8 @select_icmp_slt_zero_smin(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_slt_zero_smin(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT: [[SEL:%.*]] = or i8 [[X]], -128
+; CHECK-NEXT: ret i8 [[SEL]]
+;
+ %icmp = icmp slt i8 %x, 0
+ %xor = xor i8 %x, -128
+ %sel = select i1 %icmp, i8 %x, i8 %xor
+ ret i8 %sel
+}
+
+define i8 @select_icmp_slt_zero_smin_flipped(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_slt_zero_smin_flipped(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT: [[SEL:%.*]] = and i8 [[X]], 127
+; CHECK-NEXT: ret i8 [[SEL]]
+;
+ %icmp = icmp slt i8 %x, 0
+ %xor = xor i8 %x, -128
+ %sel = select i1 %icmp, i8 %xor, i8 %x
+ ret i8 %sel
+}
+
+define i8 @select_icmp_slt_not_zero(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_slt_not_zero(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT: [[ICMP:%.*]] = icmp slt i8 [[X]], 1
+; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[X]], -128
+; CHECK-NEXT: [[SEL:%.*]] = select i1 [[ICMP]], i8 [[X]], i8 [[XOR]]
+; CHECK-NEXT: ret i8 [[SEL]]
+;
+ %icmp = icmp slt i8 %x, 1
+ %xor = xor i8 %x, -128
+ %sel = select i1 %icmp, i8 %x, i8 %xor
+ ret i8 %sel
+}
+
+define i8 @select_icmp_slt_not_smin(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_slt_not_smin(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT: [[ICMP:%.*]] = icmp slt i8 [[X]], 0
+; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[X]], -127
+; CHECK-NEXT: [[SEL:%.*]] = select i1 [[ICMP]], i8 [[X]], i8 [[XOR]]
+; CHECK-NEXT: ret i8 [[SEL]]
+;
+ %icmp = icmp slt i8 %x, 0
+ %xor = xor i8 %x, -127
+ %sel = select i1 %icmp, i8 %x, i8 %xor
+ ret i8 %sel
+}
+
+define i8 @select_icmp_sgt_allones_smin(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_sgt_allones_smin(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT: [[SEL:%.*]] = and i8 [[X]], 127
+; CHECK-NEXT: ret i8 [[SEL]]
+;
+ %icmp = icmp sgt i8 %x, 255
+ %xor = xor i8 %x, -128
+ %sel = select i1 %icmp, i8 %x, i8 %xor
+ ret i8 %sel
+}
+
+define i8 @select_icmp_sgt_allones_smin_flipped(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_sgt_allones_smin_flipped(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT: [[SEL:%.*]] = or i8 [[X]], -128
+; CHECK-NEXT: ret i8 [[SEL]]
+;
+ %icmp = icmp sgt i8 %x, 255
+ %xor = xor i8 %x, -128
+ %sel = select i1 %icmp, i8 %xor, i8 %x
+ ret i8 %sel
+}
+
+define i8 @select_icmp_sgt_not_allones(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_sgt_not_allones(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT: [[ICMP:%.*]] = icmp sgt i8 [[X]], -2
+; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[X]], -128
+; CHECK-NEXT: [[SEL:%.*]] = select i1 [[ICMP]], i8 [[X]], i8 [[XOR]]
+; CHECK-NEXT: ret i8 [[SEL]]
+;
+ %icmp = icmp sgt i8 %x, 254
+ %xor = xor i8 %x, -128
+ %sel = select i1 %icmp, i8 %x, i8 %xor
+ ret i8 %sel
+}
+
+define i8 @select_icmp_sgt_not_smin(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_sgt_not_smin(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[X]], -127
+; CHECK-NEXT: [[ICMP1:%.*]] = icmp slt i8 [[X]], 0
+; CHECK-NEXT: [[SEL:%.*]] = select i1 [[ICMP1]], i8 [[XOR]], i8 [[X]]
+; CHECK-NEXT: ret i8 [[SEL]]
+;
+ %icmp = icmp sgt i8 %x, 255
+ %xor = xor i8 %x, -127
+ %sel = select i1 %icmp, i8 %x, i8 %xor
+ ret i8 %sel
+}
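The new file covers selects between x and x ^ C keyed on a test of the affected bits. The reasoning: when C is a single bit and the condition tests exactly that bit, both select arms agree everywhere else and the chosen arm always has the bit in one fixed state, so the pattern collapses to clearing the bit (and x, ~C) or setting it (or x, C); the slt 0 / sgt -1 entries are the same trick on the sign bit with C = -128, and the not_pow2/not_smin entries check that near-miss patterns are left alone. A worked sketch of the eq-zero case with C = 4, assuming i8:

define i8 @clear_bit_sketch(i8 %x) {
  ; If bit 2 of %x is clear, the select returns %x, which already has
  ; it clear; if set, it returns %x ^ 4, which clears it. Either way
  ; the result is %x with bit 2 forced to 0:
  %res = and i8 %x, -5  ; -5 == ~4 as i8
  ret i8 %res
}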
diff --git a/llvm/test/Transforms/LoopVersioning/wrapping-pointer-non-integral-addrspace.ll b/llvm/test/Transforms/LoopVersioning/wrapping-pointer-non-integral-addrspace.ll
index 430baa1..5abdde9 100644
--- a/llvm/test/Transforms/LoopVersioning/wrapping-pointer-non-integral-addrspace.ll
+++ b/llvm/test/Transforms/LoopVersioning/wrapping-pointer-non-integral-addrspace.ll
@@ -1,4 +1,5 @@
-; RUN: opt -passes=loop-versioning -S < %s | FileCheck %s -check-prefix=LV
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=loop-versioning -S < %s | FileCheck %s
; NB: addrspaces 10-13 are non-integral
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:10:11:12:13"
@@ -12,40 +13,113 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:10:11:12:13"
declare i64 @julia_steprange_last_4949()
-define void @"japi1_align!_9477"(ptr %arg) {
-; LV-LAVEL: L26.lver.check
-; LV: [[OFMul:%[^ ]*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 4, i64 [[Step:%[^ ]*]])
-; LV-NEXT: [[OFMulResult:%[^ ]*]] = extractvalue { i64, i1 } [[OFMul]], 0
-; LV-NEXT: [[OFMulOverflow:%[^ ]*]] = extractvalue { i64, i1 } [[OFMul]], 1
-; LV: [[OFNegMulResult:%[^ ]*]] = sub i64 0, [[OFMulResult]]
-; LV-NEXT: [[NegGEP:%[^ ]*]] = getelementptr i8, ptr addrspace(13) [[Base:%[^ ]*]], i64 [[OFNegMulResult]]
-; LV-NEXT: icmp ugt ptr addrspace(13) [[NegGEP]], [[Base]]
-; LV-NOT: inttoptr
-; LV-NOT: ptrtoint
+define void @wrapping_ptr_nonint_addrspace(ptr %arg) {
+; CHECK-LABEL: define void @wrapping_ptr_nonint_addrspace(
+; CHECK-SAME: ptr [[ARG:%.*]]) {
+; CHECK-NEXT: [[LOOP_LVER_CHECK:.*:]]
+; CHECK-NEXT: [[LOAD0:%.*]] = load ptr addrspace(10), ptr [[ARG]], align 8
+; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr inttoptr (i64 12 to ptr), align 4
+; CHECK-NEXT: [[SUB:%.*]] = sub i32 0, [[LOAD1]]
+; CHECK-NEXT: [[CALL:%.*]] = call i64 @julia_steprange_last_4949()
+; CHECK-NEXT: [[CAST0:%.*]] = addrspacecast ptr addrspace(10) [[LOAD0]] to ptr addrspace(11)
+; CHECK-NEXT: [[LOAD2:%.*]] = load ptr addrspace(10), ptr addrspace(11) [[CAST0]], align 8
+; CHECK-NEXT: [[CAST1:%.*]] = addrspacecast ptr addrspace(10) [[LOAD2]] to ptr addrspace(11)
+; CHECK-NEXT: [[LOAD3:%.*]] = load ptr addrspace(13), ptr addrspace(11) [[CAST1]], align 8
+; CHECK-NEXT: [[SEXT:%.*]] = sext i32 [[SUB]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[CALL]], 2
+; CHECK-NEXT: [[TMP1:%.*]] = shl nsw i64 [[SEXT]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], -4
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr addrspace(13) [[LOAD3]], i64 [[TMP3]]
+; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr addrspace(13) [[LOAD3]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP0]], -4
+; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr addrspace(13) [[LOAD3]], i64 [[TMP4]]
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr addrspace(13) [[SCEVGEP]], [[LOAD3]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr addrspace(13) [[SCEVGEP2]], [[SCEVGEP1]]
+; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub i64 0, [[CALL]]
+; CHECK-NEXT: [[TMP6:%.*]] = shl nsw i64 [[SEXT]], 2
+; CHECK-NEXT: [[TMP7:%.*]] = add nsw i64 [[TMP6]], -4
+; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr addrspace(13) [[LOAD3]], i64 [[TMP7]]
+; CHECK-NEXT: [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 4, i64 [[TMP5]])
+; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i64, i1 } [[MUL]], 0
+; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i64, i1 } [[MUL]], 1
+; CHECK-NEXT: [[TMP8:%.*]] = sub i64 0, [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(13) [[SCEVGEP3]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = icmp ugt ptr addrspace(13) [[TMP9]], [[SCEVGEP3]]
+; CHECK-NEXT: [[TMP11:%.*]] = or i1 [[TMP10]], [[MUL_OVERFLOW]]
+; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr addrspace(13) [[LOAD3]], i64 -4
+; CHECK-NEXT: [[MUL5:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 4, i64 [[TMP5]])
+; CHECK-NEXT: [[MUL_RESULT6:%.*]] = extractvalue { i64, i1 } [[MUL5]], 0
+; CHECK-NEXT: [[MUL_OVERFLOW7:%.*]] = extractvalue { i64, i1 } [[MUL5]], 1
+; CHECK-NEXT: [[TMP12:%.*]] = sub i64 0, [[MUL_RESULT6]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr addrspace(13) [[SCEVGEP4]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = icmp ugt ptr addrspace(13) [[TMP13]], [[SCEVGEP4]]
+; CHECK-NEXT: [[TMP15:%.*]] = or i1 [[TMP14]], [[MUL_OVERFLOW7]]
+; CHECK-NEXT: [[TMP16:%.*]] = or i1 [[TMP11]], [[TMP15]]
+; CHECK-NEXT: [[LVER_SAFE:%.*]] = or i1 [[FOUND_CONFLICT]], [[TMP16]]
+; CHECK-NEXT: br i1 [[LVER_SAFE]], label %[[LOOP_PH_LVER_ORIG:.*]], label %[[LOOP_PH:.*]]
+; CHECK: [[LOOP_PH_LVER_ORIG]]:
+; CHECK-NEXT: br label %[[LOOP_LVER_ORIG:.*]]
+; CHECK: [[LOOP_LVER_ORIG]]:
+; CHECK-NEXT: [[VALUE_PHI3_LVER_ORIG:%.*]] = phi i64 [ 0, %[[LOOP_PH_LVER_ORIG]] ], [ [[ADD0_LVER_ORIG:%.*]], %[[LOOP_LVER_ORIG]] ]
+; CHECK-NEXT: [[ADD0_LVER_ORIG]] = add i64 [[VALUE_PHI3_LVER_ORIG]], -1
+; CHECK-NEXT: [[GEP0_LVER_ORIG:%.*]] = getelementptr inbounds i32, ptr addrspace(13) [[LOAD3]], i64 [[ADD0_LVER_ORIG]]
+; CHECK-NEXT: [[LOAD4_LVER_ORIG:%.*]] = load i32, ptr addrspace(13) [[GEP0_LVER_ORIG]], align 4
+; CHECK-NEXT: [[ADD1_LVER_ORIG:%.*]] = add i64 [[ADD0_LVER_ORIG]], [[SEXT]]
+; CHECK-NEXT: [[GEP1_LVER_ORIG:%.*]] = getelementptr inbounds i32, ptr addrspace(13) [[LOAD3]], i64 [[ADD1_LVER_ORIG]]
+; CHECK-NEXT: store i32 [[LOAD4_LVER_ORIG]], ptr addrspace(13) [[GEP1_LVER_ORIG]], align 4
+; CHECK-NEXT: [[CMP_LVER_ORIG:%.*]] = icmp eq i64 [[VALUE_PHI3_LVER_ORIG]], [[CALL]]
+; CHECK-NEXT: br i1 [[CMP_LVER_ORIG]], label %[[EXIT_LOOPEXIT:.*]], label %[[LOOP_LVER_ORIG]]
+; CHECK: [[LOOP_PH]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[VALUE_PHI3:%.*]] = phi i64 [ 0, %[[LOOP_PH]] ], [ [[ADD0:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[ADD0]] = add i64 [[VALUE_PHI3]], -1
+; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i32, ptr addrspace(13) [[LOAD3]], i64 [[ADD0]]
+; CHECK-NEXT: [[LOAD4:%.*]] = load i32, ptr addrspace(13) [[GEP0]], align 4, !alias.scope [[META0:![0-9]+]]
+; CHECK-NEXT: [[ADD1:%.*]] = add i64 [[ADD0]], [[SEXT]]
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(13) [[LOAD3]], i64 [[ADD1]]
+; CHECK-NEXT: store i32 [[LOAD4]], ptr addrspace(13) [[GEP1]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[VALUE_PHI3]], [[CALL]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[EXIT_LOOPEXIT8:.*]], label %[[LOOP]]
+; CHECK: [[EXIT_LOOPEXIT]]:
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT_LOOPEXIT8]]:
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
top:
- %tmp = load ptr addrspace(10), ptr %arg, align 8
- %tmp1 = load i32, ptr inttoptr (i64 12 to ptr), align 4
- %tmp2 = sub i32 0, %tmp1
- %tmp3 = call i64 @julia_steprange_last_4949()
- %tmp4 = addrspacecast ptr addrspace(10) %tmp to ptr addrspace(11)
- %tmp6 = load ptr addrspace(10), ptr addrspace(11) %tmp4, align 8
- %tmp7 = addrspacecast ptr addrspace(10) %tmp6 to ptr addrspace(11)
- %tmp9 = load ptr addrspace(13), ptr addrspace(11) %tmp7, align 8
- %tmp10 = sext i32 %tmp2 to i64
- br label %L26
+ %load0 = load ptr addrspace(10), ptr %arg, align 8
+ %load1 = load i32, ptr inttoptr (i64 12 to ptr), align 4
+ %sub = sub i32 0, %load1
+ %call = call i64 @julia_steprange_last_4949()
+ %cast0 = addrspacecast ptr addrspace(10) %load0 to ptr addrspace(11)
+ %load2 = load ptr addrspace(10), ptr addrspace(11) %cast0, align 8
+ %cast1 = addrspacecast ptr addrspace(10) %load2 to ptr addrspace(11)
+ %load3 = load ptr addrspace(13), ptr addrspace(11) %cast1, align 8
+ %sext = sext i32 %sub to i64
+ br label %loop
-L26:
- %value_phi3 = phi i64 [ 0, %top ], [ %tmp11, %L26 ]
- %tmp11 = add i64 %value_phi3, -1
- %tmp12 = getelementptr inbounds i32, ptr addrspace(13) %tmp9, i64 %tmp11
- %tmp13 = load i32, ptr addrspace(13) %tmp12, align 4
- %tmp14 = add i64 %tmp11, %tmp10
- %tmp15 = getelementptr inbounds i32, ptr addrspace(13) %tmp9, i64 %tmp14
- store i32 %tmp13, ptr addrspace(13) %tmp15, align 4
- %tmp16 = icmp eq i64 %value_phi3, %tmp3
- br i1 %tmp16, label %L45, label %L26
+loop:
+ %value_phi3 = phi i64 [ 0, %top ], [ %add0, %loop ]
+ %add0 = add i64 %value_phi3, -1
+ %gep0 = getelementptr inbounds i32, ptr addrspace(13) %load3, i64 %add0
+ %load4 = load i32, ptr addrspace(13) %gep0, align 4
+ %add1 = add i64 %add0, %sext
+ %gep1 = getelementptr inbounds i32, ptr addrspace(13) %load3, i64 %add1
+ store i32 %load4, ptr addrspace(13) %gep1, align 4
+ %cmp = icmp eq i64 %value_phi3, %call
+ br i1 %cmp, label %exit, label %loop
-L45:
+exit:
ret void
}
-
+;.
+; CHECK: [[META0]] = !{[[META1:![0-9]+]]}
+; CHECK: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]}
+; CHECK: [[META2]] = distinct !{[[META2]], !"LVerDomain"}
+; CHECK: [[META3]] = !{[[META4:![0-9]+]]}
+; CHECK: [[META4]] = distinct !{[[META4]], [[META2]]}
+;.
diff --git a/llvm/test/tools/llvm-cov/Inputs/binary-formats.v6.wasm32 b/llvm/test/tools/llvm-cov/Inputs/binary-formats.v6.wasm32
deleted file mode 100755
index 5a606d5..0000000
--- a/llvm/test/tools/llvm-cov/Inputs/binary-formats.v6.wasm32
+++ /dev/null
Binary files differ
diff --git a/llvm/test/tools/llvm-cov/Inputs/binary-formats.wasm.proftext b/llvm/test/tools/llvm-cov/Inputs/binary-formats.wasm.proftext
deleted file mode 100644
index 20fc381..0000000
--- a/llvm/test/tools/llvm-cov/Inputs/binary-formats.wasm.proftext
+++ /dev/null
@@ -1,4 +0,0 @@
-__main_argc_argv
-0x0
-1
-100
diff --git a/llvm/test/tools/llvm-cov/binary-formats.c b/llvm/test/tools/llvm-cov/binary-formats.c
index bb61b28..a5bfc01 100644
--- a/llvm/test/tools/llvm-cov/binary-formats.c
+++ b/llvm/test/tools/llvm-cov/binary-formats.c
@@ -10,11 +10,4 @@ int main(int argc, const char *argv[]) {}
// RUN: llvm-cov show %S/Inputs/binary-formats.v3.macho64l -instr-profile %t.profdata -path-equivalence=/tmp,%S %s | FileCheck %s
// RUN: llvm-cov show %S/Inputs/binary-formats.v6.linux64l -instr-profile %t.profdata -path-equivalence=/tmp,%S %s | FileCheck %s
-// RUN: llvm-profdata merge %S/Inputs/binary-formats.wasm.proftext -o %t.wasm.profdata
-// NOTE: The wasm binary is built with the following command:
-// clang -target wasm32-unknown-wasi %s -o %S/Inputs/binary-formats.v6.wasm32 \
-// -mllvm -enable-name-compression=false \
-// -fprofile-instr-generate -fcoverage-mapping -lwasi-emulated-getpid -lwasi-emulated-mman
-// RUN: llvm-cov show %S/Inputs/binary-formats.v6.wasm32 -instr-profile %t.wasm.profdata -path-equivalence=/tmp,%S %s | FileCheck %s
-
// RUN: llvm-cov export %S/Inputs/binary-formats.macho64l -instr-profile %t.profdata | FileCheck %S/Inputs/binary-formats.canonical.json
diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
index 7400b6c..dc40e58 100644
--- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
@@ -200,6 +200,8 @@ TEST_F(SelectionDAGPatternMatchTest, matchBinaryOp) {
SDValue SMin = DAG->getNode(ISD::SMIN, DL, Int32VT, Op1, Op0);
SDValue UMax = DAG->getNode(ISD::UMAX, DL, Int32VT, Op0, Op1);
SDValue UMin = DAG->getNode(ISD::UMIN, DL, Int32VT, Op1, Op0);
+ SDValue Rotl = DAG->getNode(ISD::ROTL, DL, Int32VT, Op0, Op1);
+ SDValue Rotr = DAG->getNode(ISD::ROTR, DL, Int32VT, Op1, Op0);
SDValue ICMP_GT = DAG->getSetCC(DL, MVT::i1, Op0, Op1, ISD::SETGT);
SDValue ICMP_GE = DAG->getSetCC(DL, MVT::i1, Op0, Op1, ISD::SETGE);
@@ -246,6 +248,11 @@ TEST_F(SelectionDAGPatternMatchTest, matchBinaryOp) {
EXPECT_FALSE(sd_match(DisOr, m_Add(m_Value(), m_Value())));
EXPECT_TRUE(sd_match(DisOr, m_AddLike(m_Value(), m_Value())));
+ EXPECT_TRUE(sd_match(Rotl, m_Rotl(m_Value(), m_Value())));
+ EXPECT_TRUE(sd_match(Rotr, m_Rotr(m_Value(), m_Value())));
+ EXPECT_FALSE(sd_match(Rotl, m_Rotr(m_Value(), m_Value())));
+ EXPECT_FALSE(sd_match(Rotr, m_Rotl(m_Value(), m_Value())));
+
EXPECT_TRUE(sd_match(SMax, m_c_BinOp(ISD::SMAX, m_Value(), m_Value())));
EXPECT_TRUE(sd_match(SMax, m_SMax(m_Value(), m_Value())));
EXPECT_TRUE(sd_match(SMax, m_SMaxLike(m_Value(), m_Value())));
@@ -302,7 +309,12 @@ TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) {
SDValue FPToSI = DAG->getNode(ISD::FP_TO_SINT, DL, FloatVT, Op2);
SDValue FPToUI = DAG->getNode(ISD::FP_TO_UINT, DL, FloatVT, Op2);
+ SDValue Brev = DAG->getNode(ISD::BITREVERSE, DL, Int32VT, Op0);
+ SDValue Bswap = DAG->getNode(ISD::BSWAP, DL, Int32VT, Op0);
+
+ SDValue Ctpop = DAG->getNode(ISD::CTPOP, DL, Int32VT, Op0);
SDValue Ctlz = DAG->getNode(ISD::CTLZ, DL, Int32VT, Op0);
+ SDValue Cttz = DAG->getNode(ISD::CTTZ, DL, Int32VT, Op0);
using namespace SDPatternMatch;
EXPECT_TRUE(sd_match(ZExt, m_UnaryOp(ISD::ZERO_EXTEND, m_Value())));
@@ -328,7 +340,17 @@ TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) {
EXPECT_FALSE(sd_match(FPToUI, m_FPToSI(m_Value())));
EXPECT_FALSE(sd_match(FPToSI, m_FPToUI(m_Value())));
+ EXPECT_TRUE(sd_match(Brev, m_BitReverse(m_Value())));
+ EXPECT_TRUE(sd_match(Bswap, m_BSwap(m_Value())));
+ EXPECT_FALSE(sd_match(Brev, m_BSwap(m_Value())));
+ EXPECT_FALSE(sd_match(Bswap, m_BitReverse(m_Value())));
+
+ EXPECT_TRUE(sd_match(Ctpop, m_Ctpop(m_Value())));
EXPECT_TRUE(sd_match(Ctlz, m_Ctlz(m_Value())));
+ EXPECT_TRUE(sd_match(Cttz, m_Cttz(m_Value())));
+ EXPECT_FALSE(sd_match(Ctpop, m_Ctlz(m_Value())));
+ EXPECT_FALSE(sd_match(Ctlz, m_Cttz(m_Value())));
+ EXPECT_FALSE(sd_match(Cttz, m_Ctlz(m_Value())));
}
TEST_F(SelectionDAGPatternMatchTest, matchConstants) {
diff --git a/llvm/unittests/Support/FormatVariadicTest.cpp b/llvm/unittests/Support/FormatVariadicTest.cpp
index e745f99..03102c9 100644
--- a/llvm/unittests/Support/FormatVariadicTest.cpp
+++ b/llvm/unittests/Support/FormatVariadicTest.cpp
@@ -150,7 +150,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) {
EXPECT_EQ(0u, Replacements[0].Index);
EXPECT_EQ(3u, Replacements[0].Width);
EXPECT_EQ(AlignStyle::Left, Replacements[0].Where);
- EXPECT_EQ("foo", Replacements[0].Options);
+ EXPECT_EQ(" foo ", Replacements[0].Options);
// 8. Everything after the first option specifier is part of the style, even
// if it contains another option specifier.
diff --git a/llvm/unittests/Transforms/Instrumentation/PGOInstrumentationTest.cpp b/llvm/unittests/Transforms/Instrumentation/PGOInstrumentationTest.cpp
index a4c076a..9ccb139 100644
--- a/llvm/unittests/Transforms/Instrumentation/PGOInstrumentationTest.cpp
+++ b/llvm/unittests/Transforms/Instrumentation/PGOInstrumentationTest.cpp
@@ -103,13 +103,9 @@ public:
ModuleAnalysisManager::Invalidator &));
};
-template <typename ParamType> struct PGOTestName {
- std::string operator()(const TestParamInfo<ParamType> &Info) const {
- return std::get<1>(Info.param).str();
- }
-};
-
-struct PGOInstrumentationGenTest : public Test {
+struct PGOInstrumentationGenTest
+ : public Test,
+ WithParamInterface<std::tuple<StringRef, StringRef>> {
ModulePassManager MPM;
PassBuilder PB;
MockModuleAnalysisHandle MMAHandle;
@@ -145,47 +141,12 @@ struct PGOInstrumentationGenTest : public Test {
}
};
-struct PGOInstrumentationGenInstrumentTest
- : PGOInstrumentationGenTest,
- WithParamInterface<std::tuple<StringRef, StringRef>> {};
-
static constexpr StringRef CodeWithFuncDefs = R"(
define i32 @f(i32 %n) {
entry:
ret i32 0
})";
-INSTANTIATE_TEST_SUITE_P(
- PGOInstrumetationGenTestSuite, PGOInstrumentationGenInstrumentTest,
- Values(std::make_tuple(CodeWithFuncDefs, "instrument_function_defs")),
- PGOTestName<PGOInstrumentationGenInstrumentTest::ParamType>());
-
-TEST_P(PGOInstrumentationGenInstrumentTest, Instrumented) {
- const StringRef Code = std::get<0>(GetParam());
- parseAssembly(Code);
-
- ASSERT_THAT(M, NotNull());
-
- Sequence PassSequence;
- EXPECT_CALL(MMAHandle, run(Ref(*M), _))
- .InSequence(PassSequence)
- .WillOnce(DoDefault());
- EXPECT_CALL(MMAHandle, invalidate(Ref(*M), _, _))
- .InSequence(PassSequence)
- .WillOnce(DoDefault());
-
- MPM.run(*M, MAM);
-
- const auto *IRInstrVar =
- M->getNamedGlobal(INSTR_PROF_QUOTE(INSTR_PROF_RAW_VERSION_VAR));
- ASSERT_THAT(IRInstrVar, NotNull());
- EXPECT_FALSE(IRInstrVar->isDeclaration());
-}
-
-struct PGOInstrumentationGenIgnoreTest
- : PGOInstrumentationGenTest,
- WithParamInterface<std::tuple<StringRef, StringRef>> {};
-
static constexpr StringRef CodeWithFuncDecls = R"(
declare i32 @f(i32);
)";
@@ -196,26 +157,33 @@ static constexpr StringRef CodeWithGlobals = R"(
)";
INSTANTIATE_TEST_SUITE_P(
- PGOInstrumetationGenIgnoreTestSuite, PGOInstrumentationGenIgnoreTest,
- Values(std::make_tuple(CodeWithFuncDecls, "instrument_function_decls"),
+    PGOInstrumentationGenTestSuite, PGOInstrumentationGenTest,
+ Values(std::make_tuple(CodeWithFuncDefs, "instrument_function_defs"),
+ std::make_tuple(CodeWithFuncDecls, "instrument_function_decls"),
std::make_tuple(CodeWithGlobals, "instrument_globals")),
- PGOTestName<PGOInstrumentationGenIgnoreTest::ParamType>());
+ [](const TestParamInfo<PGOInstrumentationGenTest::ParamType> &Info) {
+ return std::get<1>(Info.param).str();
+ });
-TEST_P(PGOInstrumentationGenIgnoreTest, NotInstrumented) {
+TEST_P(PGOInstrumentationGenTest, Instrumented) {
const StringRef Code = std::get<0>(GetParam());
-
parseAssembly(Code);
ASSERT_THAT(M, NotNull());
- EXPECT_CALL(MMAHandle, run(Ref(*M), _)).WillOnce(DoDefault());
- EXPECT_CALL(MMAHandle, invalidate(Ref(*M), _, _)).Times(0);
+ Sequence PassSequence;
+ EXPECT_CALL(MMAHandle, run(Ref(*M), _))
+ .InSequence(PassSequence)
+ .WillOnce(DoDefault());
+ EXPECT_CALL(MMAHandle, invalidate(Ref(*M), _, _))
+ .InSequence(PassSequence)
+ .WillOnce(DoDefault());
MPM.run(*M, MAM);
const auto *IRInstrVar =
M->getNamedGlobal(INSTR_PROF_QUOTE(INSTR_PROF_RAW_VERSION_VAR));
- ASSERT_THAT(IRInstrVar, NotNull());
+ EXPECT_THAT(IRInstrVar, NotNull());
EXPECT_FALSE(IRInstrVar->isDeclaration());
}
diff --git a/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn
index 780a69f..1287bdd 100644
--- a/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn
@@ -21,6 +21,7 @@ unittest("ClangAnalysisFlowSensitiveTests") {
"ASTOpsTest.cpp",
"ArenaTest.cpp",
"CFGMatchSwitchTest.cpp",
+ "CachedConstAccessorsLatticeTest.cpp",
"ChromiumCheckModelTest.cpp",
"DataflowAnalysisContextTest.cpp",
"DataflowEnvironmentTest.cpp",
diff --git a/llvm/utils/gn/secondary/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/BUILD.gn
index 03e8257..f9249c2 100644
--- a/llvm/utils/gn/secondary/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/BUILD.gn
@@ -1,3 +1,14 @@
+import("//lldb/utils/TableGen/lldb_tablegen.gni")
+
+lldb_tablegen("DynamicLoaderDarwinProperties") {
+ args = [ "-gen-lldb-property-defs" ]
+}
+
+lldb_tablegen("DynamicLoaderDarwinPropertiesEnum") {
+ args = [ "-gen-lldb-property-enum-defs" ]
+ td_file = "DynamicLoaderDarwinProperties.td"
+}
+
static_library("MacOSX-DYLD") {
output_name = "lldbPluginDynamicLoaderMacOSXDYLD"
configs += [
@@ -5,6 +16,8 @@ static_library("MacOSX-DYLD") {
"//llvm/utils/gn/build:lldb_code",
]
deps = [
+ ":DynamicLoaderDarwinProperties",
+ ":DynamicLoaderDarwinPropertiesEnum",
"//lldb/source/Breakpoint",
"//lldb/source/Core",
"//lldb/source/Expression",
@@ -21,6 +34,7 @@ static_library("MacOSX-DYLD") {
include_dirs = [ "//lldb/source" ]
sources = [
"DynamicLoaderDarwin.cpp",
+ "DynamicLoaderDarwinProperties.cpp",
"DynamicLoaderMacOS.cpp",
"DynamicLoaderMacOSXDYLD.cpp",
]
diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn
index 5146c9a..85dfd77 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn
@@ -16,7 +16,9 @@ static_library("Vectorize") {
"SandboxVectorizer/DependencyGraph.cpp",
"SandboxVectorizer/Interval.cpp",
"SandboxVectorizer/Passes/BottomUpVec.cpp",
+ "SandboxVectorizer/Passes/RegionsFromMetadata.cpp",
"SandboxVectorizer/SandboxVectorizer.cpp",
+ "SandboxVectorizer/SandboxVectorizerPassBuilder.cpp",
"SandboxVectorizer/SeedCollector.cpp",
"VPlan.cpp",
"VPlanAnalysis.cpp",
diff --git a/llvm/utils/lit/lit/reports.py b/llvm/utils/lit/lit/reports.py
index 2ac44b0..d2d719b 100755
--- a/llvm/utils/lit/lit/reports.py
+++ b/llvm/utils/lit/lit/reports.py
@@ -105,12 +105,20 @@ class XunitReport(object):
file.write("</testsuites>\n")
def _write_testsuite(self, file, suite, tests):
- skipped = sum(1 for t in tests if t.result.code in self.skipped_codes)
- failures = sum(1 for t in tests if t.isFailure())
+ skipped = 0
+ failures = 0
+ time = 0.0
+
+ for t in tests:
+ if t.result.code in self.skipped_codes:
+ skipped += 1
+ if t.isFailure():
+ failures += 1
+ time += t.result.elapsed
name = suite.config.name.replace(".", "-")
file.write(
- f'<testsuite name={quo(name)} tests="{len(tests)}" failures="{failures}" skipped="{skipped}">\n'
+ f'<testsuite name={quo(name)} tests="{len(tests)}" failures="{failures}" skipped="{skipped}" time="{time:.2f}">\n'
)
for test in tests:
self._write_test(file, test, name)
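With the change above, each <testsuite> header now carries a time attribute summed from the per-test elapsed values. As a sketch (suite name and timings invented), three tests taking 0.10s, 0.25s, and 0.05s would produce:

    <testsuite name="my-suite" tests="3" failures="0" skipped="0" time="0.40">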
diff --git a/llvm/utils/lit/tests/shtest-format.py b/llvm/utils/lit/tests/shtest-format.py
index 4a3d65b..3a19595 100644
--- a/llvm/utils/lit/tests/shtest-format.py
+++ b/llvm/utils/lit/tests/shtest-format.py
@@ -107,7 +107,7 @@
# XUNIT: <?xml version="1.0" encoding="UTF-8"?>
# XUNIT-NEXT: <testsuites time="{{[0-9.]+}}">
-# XUNIT-NEXT: <testsuite name="shtest-format" tests="22" failures="8" skipped="3">
+# XUNIT-NEXT: <testsuite name="shtest-format" tests="22" failures="8" skipped="3" time="{{[0-9.]+}}">
# XUNIT: <testcase classname="shtest-format.external_shell" name="fail.txt" time="{{[0-9]+\.[0-9]+}}">
# XUNIT-NEXT: <failure{{[ ]*}}>
diff --git a/llvm/utils/lit/tests/xunit-output.py b/llvm/utils/lit/tests/xunit-output.py
index 67d9984..392cded46 100644
--- a/llvm/utils/lit/tests/xunit-output.py
+++ b/llvm/utils/lit/tests/xunit-output.py
@@ -9,7 +9,7 @@
# CHECK: <?xml version="1.0" encoding="UTF-8"?>
# CHECK-NEXT: <testsuites time="{{[0-9.]+}}">
-# CHECK-NEXT: <testsuite name="test-data" tests="5" failures="1" skipped="3">
+# CHECK-NEXT: <testsuite name="test-data" tests="5" failures="1" skipped="3" time="{{[0-9.]+}}">
# CHECK-NEXT: <testcase classname="test-data.test-data" name="bad&amp;name.ini" time="{{[0-1]\.[0-9]+}}">
# CHECK-NEXT: <failure><![CDATA[& < > ]]]]><![CDATA[> &"]]></failure>
# CHECK-NEXT: </testcase>
diff --git a/llvm/utils/revert_checker.py b/llvm/utils/revert_checker.py
index da80bdf..b1c6e22 100755
--- a/llvm/utils/revert_checker.py
+++ b/llvm/utils/revert_checker.py
@@ -45,35 +45,78 @@ import logging
import re
import subprocess
import sys
-from typing import Generator, List, NamedTuple, Iterable
+from typing import Dict, Generator, Iterable, List, NamedTuple, Optional, Tuple
assert sys.version_info >= (3, 6), "Only Python 3.6+ is supported."
# People are creative with their reverts, and heuristics are a bit difficult.
-# Like 90% of of reverts have "This reverts commit ${full_sha}".
-# Some lack that entirely, while others have many of them specified in ad-hoc
-# ways, while others use short SHAs and whatever.
+# At a glance, most reverts have "This reverts commit ${full_sha}". Many others
+# have `Reverts llvm/llvm-project#${PR_NUMBER}`.
#
-# The 90% case is trivial to handle (and 100% free + automatic). The extra 10%
-# starts involving human intervention, which is probably not worth it for now.
+# By their powers combined, we should be able to automatically catch something
+# like 80% of reverts with reasonable confidence. At some point, human
+# intervention will always be required (e.g., I saw
+# ```
+# This reverts commit ${commit_sha_1} and
+# also ${commit_sha_2_shorthand}
+# ```
+# during my sample).
+
+_CommitMessageReverts = NamedTuple(
+ "_CommitMessageReverts",
+ [
+ ("potential_shas", List[str]),
+ ("potential_pr_numbers", List[int]),
+ ],
+)
+
+def _try_parse_reverts_from_commit_message(
+ commit_message: str,
+) -> _CommitMessageReverts:
+ """Tries to parse revert SHAs and LLVM PR numbers form the commit message.
-def _try_parse_reverts_from_commit_message(commit_message: str) -> List[str]:
+ Returns:
+ A namedtuple containing:
+ - A list of potentially reverted SHAs
+ - A list of potentially reverted LLVM PR numbers
+ """
if not commit_message:
- return []
+ return _CommitMessageReverts([], [])
- results = re.findall(r"This reverts commit ([a-f0-9]{40})\b", commit_message)
+ sha_reverts = re.findall(
+ r"This reverts commit ([a-f0-9]{40})\b",
+ commit_message,
+ )
first_line = commit_message.splitlines()[0]
initial_revert = re.match(r'Revert ([a-f0-9]{6,}) "', first_line)
if initial_revert:
- results.append(initial_revert.group(1))
- return results
+ sha_reverts.append(initial_revert.group(1))
+ pr_numbers = [
+ int(x)
+ for x in re.findall(
+ r"Reverts llvm/llvm-project#(\d+)",
+ commit_message,
+ )
+ ]
+
+ return _CommitMessageReverts(
+ potential_shas=sha_reverts,
+ potential_pr_numbers=pr_numbers,
+ )
-def _stream_stdout(command: List[str]) -> Generator[str, None, None]:
+
+def _stream_stdout(
+ command: List[str], cwd: Optional[str] = None
+) -> Generator[str, None, None]:
with subprocess.Popen(
- command, stdout=subprocess.PIPE, encoding="utf-8", errors="replace"
+ command,
+ cwd=cwd,
+ stdout=subprocess.PIPE,
+ encoding="utf-8",
+ errors="replace",
) as p:
assert p.stdout is not None # for mypy's happiness.
yield from p.stdout
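A rough illustration of the two heuristics working together, evaluated inside the module (the SHA and PR number are invented):

    msg = (
        'Revert "[Foo] Teach bar about baz (#12345)"\n'
        "\n"
        "Reverts llvm/llvm-project#12345\n"
        "This reverts commit 0123456789abcdef0123456789abcdef01234567.\n"
    )
    reverts = _try_parse_reverts_from_commit_message(msg)
    # reverts.potential_shas == ["0123456789abcdef0123456789abcdef01234567"]
    # reverts.potential_pr_numbers == [12345]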
@@ -175,10 +218,43 @@ def _find_common_parent_commit(git_dir: str, ref_a: str, ref_b: str) -> str:
).strip()
-def find_reverts(git_dir: str, across_ref: str, root: str) -> List[Revert]:
+def _load_pr_commit_mappings(
+ git_dir: str, root: str, min_ref: str
+) -> Dict[int, List[str]]:
+ git_log = ["git", "log", "--format=%H %s", f"{min_ref}..{root}"]
+ results = collections.defaultdict(list)
+ pr_regex = re.compile(r"\s\(#(\d+)\)$")
+ for line in _stream_stdout(git_log, cwd=git_dir):
+ m = pr_regex.search(line)
+ if not m:
+ continue
+
+ pr_number = int(m.group(1))
+ sha = line.split(None, 1)[0]
+ # N.B., these are kept in log (read: reverse chronological) order,
+ # which is what's expected by `find_reverts`.
+ results[pr_number].append(sha)
+ return results
+
+
+# N.B., max_pr_lookback's default of 20K commits is arbitrary, but should be
+# enough for the 99% case of reverts: rarely should someone land a cleanish
+# revert of a >6 month old change...
+def find_reverts(
+ git_dir: str, across_ref: str, root: str, max_pr_lookback: int = 20000
+) -> List[Revert]:
"""Finds reverts across `across_ref` in `git_dir`, starting from `root`.
These reverts are returned in order of oldest reverts first.
+
+ Args:
+ git_dir: git directory to find reverts in.
+ across_ref: the ref to find reverts across.
+ root: the 'main' ref to look for reverts on.
+ max_pr_lookback: this function uses heuristics to map PR numbers to
+ SHAs. These heuristics require that commit history from `root` to
+ `some_parent_of_root` is loaded in memory. `max_pr_lookback` is how
+ many commits behind `across_ref` should be loaded in memory.
"""
across_sha = _rev_parse(git_dir, across_ref)
root_sha = _rev_parse(git_dir, root)
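The mapping leans on the GitHub squash-merge convention that a commit's subject line ends with the PR number. A sketch of one loop iteration (subject and SHA invented):

    line = "1111222233334444555566667777888899990000 [clang] Fix foo crash (#98765)"
    m = pr_regex.search(line)  # matches the trailing " (#98765)"
    # -> results[98765] == ["1111222233334444555566667777888899990000"]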
@@ -201,8 +277,41 @@ def find_reverts(git_dir: str, across_ref: str, root: str) -> List[Revert]:
)
all_reverts = []
+ # Lazily load PR <-> commit mappings, since it can be expensive.
+ pr_commit_mappings = None
for sha, commit_message in _log_stream(git_dir, root_sha, across_sha):
- reverts = _try_parse_reverts_from_commit_message(commit_message)
+ reverts, pr_reverts = _try_parse_reverts_from_commit_message(
+ commit_message,
+ )
+ if pr_reverts:
+ if pr_commit_mappings is None:
+ logging.info(
+ "Loading PR <-> commit mappings. This may take a moment..."
+ )
+ pr_commit_mappings = _load_pr_commit_mappings(
+ git_dir, root_sha, f"{across_sha}~{max_pr_lookback}"
+ )
+ logging.info(
+ "Loaded %d PR <-> commit mappings", len(pr_commit_mappings)
+ )
+
+ for reverted_pr_number in pr_reverts:
+ reverted_shas = pr_commit_mappings.get(reverted_pr_number)
+ if not reverted_shas:
+ logging.warning(
+ "No SHAs for reverted PR %d (commit %s)",
+ reverted_pr_number,
+ sha,
+ )
+ continue
+ logging.debug(
+ "Inferred SHAs %s for reverted PR %d (commit %s)",
+ reverted_shas,
+ reverted_pr_number,
+ sha,
+ )
+ reverts.extend(reverted_shas)
+
if not reverts:
continue
diff --git a/llvm/utils/revert_checker_test.py b/llvm/utils/revert_checker_test.py
index 9d99266..c149be8 100755
--- a/llvm/utils/revert_checker_test.py
+++ b/llvm/utils/revert_checker_test.py
@@ -96,6 +96,7 @@ class Test(unittest.TestCase):
git_dir=get_llvm_project_path(),
across_ref="c9944df916e41b1014dff5f6f75d52297b48ecdc~",
root="c9944df916e41b1014dff5f6f75d52297b48ecdc",
+ max_pr_lookback=50,
)
self.assertEqual(reverts, [])
@@ -113,6 +114,7 @@ class Test(unittest.TestCase):
git_dir=get_llvm_project_path(),
across_ref="c47f971694be0159ffddfee8a75ae515eba91439",
root="9f981e9adf9c8d29bb80306daf08d2770263ade6",
+ max_pr_lookback=50,
)
self.assertEqual(
reverts,
@@ -128,6 +130,27 @@ class Test(unittest.TestCase):
],
)
+ def test_pr_based_revert_works(self) -> None:
+ reverts = revert_checker.find_reverts(
+ git_dir=get_llvm_project_path(),
+ # This SHA is a direct child of the reverted SHA expected below.
+ across_ref="2d5f3b0a61fb171617012a2c3ba05fd31fb3bb1d",
+ # This SHA is a direct child of the revert SHA listed below.
+ root="2c01b278580212914ec037bb5dd9b73702dfe7f1",
+ max_pr_lookback=50,
+ )
+ self.assertEqual(
+ reverts,
+ [
+ revert_checker.Revert(
+ # This SHA is a `Reverts ${PR}` for #111004.
+ sha="50866e84d1da8462aeb96607bf6d9e5bbd5869c5",
+ # ...And this was the commit for #111004.
+ reverted_sha="67160c5ab5f5b7fd5fa7851abcfde367c8a9f91b",
+ ),
+ ],
+ )
+
if __name__ == "__main__":
unittest.main()
diff --git a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td
index 0e38325..e81db32 100644
--- a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td
+++ b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td
@@ -71,6 +71,7 @@ class ArmSME_IntrOp<string mnemonic,
/*bit requiresAccessGroup=*/0,
/*bit requiresAliasAnalysis=*/0,
/*bit requiresFastmath=*/0,
+ /*bit requiresOpBundles=*/0,
/*list<int> immArgPositions=*/immArgPositions,
/*list<string> immArgAttrNames=*/immArgAttrNames>;
diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
index a683a90..cc5463e 100644
--- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
@@ -536,6 +536,8 @@ def OneShotBufferize : Pass<"one-shot-bufferize", "ModuleOp"> {
Option<"unknownTypeConversion", "unknown-type-conversion", "std::string",
/*default=*/"\"fully-dynamic-layout-map\"",
"Controls layout maps for non-inferrable memref types.">,
+ Option<"bufferAlignment", "buffer-alignment", "uint64_t", /*default=*/"64",
+ "Sets the alignment of newly allocated buffers.">,
];
let constructor = "mlir::bufferization::createOneShotBufferizePass()";
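Assuming the usual textual pass-option syntax, the new knob can be exercised from the command line like so (a sketch; the default stays 64):

    mlir-opt input.mlir --one-shot-bufferize="buffer-alignment=16"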
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td
index 27a2b41..ea82f7f 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td
@@ -59,6 +59,8 @@ def LLVM_Dialect : Dialect {
static StringRef getStructRetAttrName() { return "llvm.sret"; }
static StringRef getWriteOnlyAttrName() { return "llvm.writeonly"; }
static StringRef getZExtAttrName() { return "llvm.zeroext"; }
+ static StringRef getOpBundleSizesAttrName() { return "op_bundle_sizes"; }
+ static StringRef getOpBundleTagsAttrName() { return "op_bundle_tags"; }
// TODO Restrict the usage of this to parameter attributes once there is an
// alternative way of modeling memory effects on FunctionOpInterface.
/// Name of the attribute that will cause the creation of a readnone memory
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
index ab40c8e..845c88b 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
@@ -120,7 +120,8 @@ def LLVM_Log2Op : LLVM_UnaryIntrOpF<"log2">;
def LLVM_LogOp : LLVM_UnaryIntrOpF<"log">;
def LLVM_Prefetch : LLVM_ZeroResultIntrOp<"prefetch", [0],
/*traits=*/[], /*requiresAccessGroup=*/0, /*requiresAliasAnalysis=*/0,
- /*immArgPositions=*/[1, 2, 3], /*immArgAttrNames=*/["rw", "hint", "cache"]
+ /*requiresOpBundles=*/0, /*immArgPositions=*/[1, 2, 3],
+ /*immArgAttrNames=*/["rw", "hint", "cache"]
> {
let arguments = (ins LLVM_AnyPointer:$addr, I32Attr:$rw, I32Attr:$hint, I32Attr:$cache);
}
@@ -176,7 +177,8 @@ class LLVM_MemcpyIntrOpBase<string name> :
DeclareOpInterfaceMethods<DestructurableAccessorOpInterface>,
DeclareOpInterfaceMethods<SafeMemorySlotAccessOpInterface>],
/*requiresAccessGroup=*/1, /*requiresAliasAnalysis=*/1,
- /*immArgPositions=*/[3], /*immArgAttrNames=*/["isVolatile"]> {
+ /*requiresOpBundles=*/0, /*immArgPositions=*/[3],
+ /*immArgAttrNames=*/["isVolatile"]> {
dag args = (ins Arg<LLVM_AnyPointer,"",[MemWrite]>:$dst,
Arg<LLVM_AnyPointer,"",[MemRead]>:$src,
AnySignlessInteger:$len, I1Attr:$isVolatile);
@@ -206,7 +208,8 @@ def LLVM_MemcpyInlineOp :
DeclareOpInterfaceMethods<DestructurableAccessorOpInterface>,
DeclareOpInterfaceMethods<SafeMemorySlotAccessOpInterface>],
/*requiresAccessGroup=*/1, /*requiresAliasAnalysis=*/1,
- /*immArgPositions=*/[2, 3], /*immArgAttrNames=*/["len", "isVolatile"]> {
+ /*requiresOpBundles=*/0, /*immArgPositions=*/[2, 3],
+ /*immArgAttrNames=*/["len", "isVolatile"]> {
dag args = (ins Arg<LLVM_AnyPointer,"",[MemWrite]>:$dst,
Arg<LLVM_AnyPointer,"",[MemRead]>:$src,
APIntAttr:$len, I1Attr:$isVolatile);
@@ -232,7 +235,8 @@ def LLVM_MemsetOp : LLVM_ZeroResultIntrOp<"memset", [0, 2],
DeclareOpInterfaceMethods<DestructurableAccessorOpInterface>,
DeclareOpInterfaceMethods<SafeMemorySlotAccessOpInterface>],
/*requiresAccessGroup=*/1, /*requiresAliasAnalysis=*/1,
- /*immArgPositions=*/[3], /*immArgAttrNames=*/["isVolatile"]> {
+ /*requiresOpBundles=*/0, /*immArgPositions=*/[3],
+ /*immArgAttrNames=*/["isVolatile"]> {
dag args = (ins Arg<LLVM_AnyPointer,"",[MemWrite]>:$dst,
I8:$val, AnySignlessInteger:$len, I1Attr:$isVolatile);
// Append the alias attributes defined by LLVM_IntrOpBase.
@@ -286,7 +290,8 @@ def LLVM_NoAliasScopeDeclOp
class LLVM_LifetimeBaseOp<string opName> : LLVM_ZeroResultIntrOp<opName, [1],
[DeclareOpInterfaceMethods<PromotableOpInterface>],
/*requiresAccessGroup=*/0, /*requiresAliasAnalysis=*/0,
- /*immArgPositions=*/[0], /*immArgAttrNames=*/["size"]> {
+ /*requiresOpBundles=*/0, /*immArgPositions=*/[0],
+ /*immArgAttrNames=*/["size"]> {
let arguments = (ins I64Attr:$size, LLVM_AnyPointer:$ptr);
let assemblyFormat = "$size `,` $ptr attr-dict `:` qualified(type($ptr))";
}
@@ -306,7 +311,8 @@ def LLVM_InvariantStartOp : LLVM_OneResultIntrOp<"invariant.start", [], [1],
def LLVM_InvariantEndOp : LLVM_ZeroResultIntrOp<"invariant.end", [2],
[DeclareOpInterfaceMethods<PromotableOpInterface>],
/*requiresAccessGroup=*/0, /*requiresAliasAnalysis=*/0,
- /*immArgPositions=*/[1], /*immArgAttrNames=*/["size"]> {
+ /*requiresOpBundles=*/0, /*immArgPositions=*/[1],
+ /*immArgAttrNames=*/["size"]> {
let arguments = (ins LLVM_DefaultPointer:$start,
I64Attr:$size,
LLVM_AnyPointer:$ptr);
@@ -368,7 +374,7 @@ class LLVM_ConstrainedIntr<string mnem, int numArgs,
SmallVector<Value> mlirOperands;
SmallVector<NamedAttribute> mlirAttrs;
if (failed(moduleImport.convertIntrinsicArguments(
- llvmOperands.take_front( }] # numArgs # [{),
+ llvmOperands.take_front( }] # numArgs # [{), {}, false,
{}, {}, mlirOperands, mlirAttrs))) {
return failure();
}
@@ -429,7 +435,26 @@ def LLVM_USHLSat : LLVM_BinarySameArgsIntrOpI<"ushl.sat">;
//
def LLVM_AssumeOp
- : LLVM_ZeroResultIntrOp<"assume", []>, Arguments<(ins I1:$cond)>;
+ : LLVM_ZeroResultIntrOp<"assume", /*overloadedOperands=*/[], /*traits=*/[],
+ /*requiresAccessGroup=*/0,
+ /*requiresAliasAnalysis=*/0,
+ /*requiresOpBundles=*/1> {
+ dag args = (ins I1:$cond);
+ let arguments = !con(args, opBundleArgs);
+
+ let assemblyFormat = [{
+ $cond
+ ( custom<OpBundles>($op_bundle_operands, type($op_bundle_operands),
+ $op_bundle_tags)^ )?
+ `:` type($cond) attr-dict
+ }];
+
+ let builders = [
+ OpBuilder<(ins "Value":$cond)>
+ ];
+
+ let hasVerifier = 1;
+}
def LLVM_SSACopyOp : LLVM_OneResultIntrOp<"ssa.copy", [], [0],
[Pure, SameOperandsAndResultType]> {
@@ -992,7 +1017,8 @@ def LLVM_DebugTrap : LLVM_ZeroResultIntrOp<"debugtrap">;
def LLVM_UBSanTrap : LLVM_ZeroResultIntrOp<"ubsantrap",
/*overloadedOperands=*/[], /*traits=*/[],
/*requiresAccessGroup=*/0, /*requiresAliasAnalysis=*/0,
- /*immArgPositions=*/[0], /*immArgAttrNames=*/["failureKind"]> {
+ /*requiresOpBundles=*/0, /*immArgPositions=*/[0],
+ /*immArgAttrNames=*/["failureKind"]> {
let arguments = (ins I8Attr:$failureKind);
}
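Given the assembly format declared above, llvm.intr.assume should round-trip with an optional operand-bundle list after the condition. A hand-written sketch (%ptr, %align, and the "align" tag are illustrative):

    %true = llvm.mlir.constant(true) : i1
    llvm.intr.assume %true ["align"(%ptr, %align : !llvm.ptr, i64)] : i1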
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td
index c3d352d8..a38dafa 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td
@@ -291,7 +291,7 @@ class LLVM_IntrOpBase<Dialect dialect, string opName, string enumName,
list<int> overloadedResults, list<int> overloadedOperands,
list<Trait> traits, int numResults,
bit requiresAccessGroup = 0, bit requiresAliasAnalysis = 0,
- bit requiresFastmath = 0,
+ bit requiresFastmath = 0, bit requiresOpBundles = 0,
list<int> immArgPositions = [],
list<string> immArgAttrNames = []>
: LLVM_OpBase<dialect, opName, !listconcat(
@@ -313,6 +313,12 @@ class LLVM_IntrOpBase<Dialect dialect, string opName, string enumName,
OptionalAttr<LLVM_AliasScopeArrayAttr>:$noalias_scopes,
OptionalAttr<LLVM_TBAATagArrayAttr>:$tbaa),
(ins )));
+ dag opBundleArgs = !if(!gt(requiresOpBundles, 0),
+ (ins VariadicOfVariadic<LLVM_Type,
+ "op_bundle_sizes">:$op_bundle_operands,
+ DenseI32ArrayAttr:$op_bundle_sizes,
+ OptionalAttr<ArrayAttr>:$op_bundle_tags),
+ (ins ));
string llvmEnumName = enumName;
string overloadedResultsCpp = "{" # !interleave(overloadedResults, ", ") # "}";
string overloadedOperandsCpp = "{" # !interleave(overloadedOperands, ", ") # "}";
@@ -336,6 +342,8 @@ class LLVM_IntrOpBase<Dialect dialect, string opName, string enumName,
SmallVector<NamedAttribute> mlirAttrs;
if (failed(moduleImport.convertIntrinsicArguments(
llvmOperands,
+ llvmOpBundles,
+ }] # !if(!gt(requiresOpBundles, 0), "true", "false") # [{,
}] # immArgPositionsCpp # [{,
}] # immArgAttrNamesCpp # [{,
mlirOperands,
@@ -381,12 +389,14 @@ class LLVM_IntrOp<string mnem, list<int> overloadedResults,
list<int> overloadedOperands, list<Trait> traits,
int numResults, bit requiresAccessGroup = 0,
bit requiresAliasAnalysis = 0, bit requiresFastmath = 0,
+ bit requiresOpBundles = 0,
list<int> immArgPositions = [],
list<string> immArgAttrNames = []>
: LLVM_IntrOpBase<LLVM_Dialect, "intr." # mnem, !subst(".", "_", mnem),
overloadedResults, overloadedOperands, traits,
numResults, requiresAccessGroup, requiresAliasAnalysis,
- requiresFastmath, immArgPositions, immArgAttrNames>;
+ requiresFastmath, requiresOpBundles, immArgPositions,
+ immArgAttrNames>;
// Base class for LLVM intrinsic operations returning no results. Places the
// intrinsic into the LLVM dialect and prefixes its name with "intr.".
@@ -406,11 +416,13 @@ class LLVM_ZeroResultIntrOp<string mnem, list<int> overloadedOperands = [],
list<Trait> traits = [],
bit requiresAccessGroup = 0,
bit requiresAliasAnalysis = 0,
+ bit requiresOpBundles = 0,
list<int> immArgPositions = [],
list<string> immArgAttrNames = []>
: LLVM_IntrOp<mnem, [], overloadedOperands, traits, /*numResults=*/0,
requiresAccessGroup, requiresAliasAnalysis,
- /*requiresFastMath=*/0, immArgPositions, immArgAttrNames>;
+ /*requiresFastMath=*/0, requiresOpBundles, immArgPositions,
+ immArgAttrNames>;
// Base class for LLVM intrinsic operations returning one result. Places the
// intrinsic into the LLVM dialect and prefixes its name with "intr.". This is
@@ -422,11 +434,12 @@ class LLVM_OneResultIntrOp<string mnem, list<int> overloadedResults = [],
list<int> overloadedOperands = [],
list<Trait> traits = [],
bit requiresFastmath = 0,
- list<int> immArgPositions = [],
- list<string> immArgAttrNames = []>
+ list<int> immArgPositions = [],
+ list<string> immArgAttrNames = []>
: LLVM_IntrOp<mnem, overloadedResults, overloadedOperands, traits, 1,
/*requiresAccessGroup=*/0, /*requiresAliasAnalysis=*/0,
- requiresFastmath, immArgPositions, immArgAttrNames>;
+ requiresFastmath, /*requiresOpBundles=*/0, immArgPositions,
+ immArgAttrNames>;
def LLVM_OneResultOpBuilder :
OpBuilder<(ins "Type":$resultType, "ValueRange":$operands,
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
index bbca7bc..d5def51 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
@@ -559,11 +559,7 @@ def LLVM_InvokeOp : LLVM_Op<"invoke", [
VariadicOfVariadic<LLVM_Type,
"op_bundle_sizes">:$op_bundle_operands,
DenseI32ArrayAttr:$op_bundle_sizes,
- DefaultValuedProperty<
- ArrayProperty<StringProperty, "operand bundle tags">,
- "ArrayRef<std::string>{}",
- "SmallVector<std::string>{}"
- >:$op_bundle_tags);
+ OptionalAttr<ArrayAttr>:$op_bundle_tags);
let results = (outs Optional<LLVM_Type>:$result);
let successors = (successor AnySuccessor:$normalDest,
AnySuccessor:$unwindDest);
@@ -678,11 +674,7 @@ def LLVM_CallOp : LLVM_MemAccessOpBase<"call",
VariadicOfVariadic<LLVM_Type,
"op_bundle_sizes">:$op_bundle_operands,
DenseI32ArrayAttr:$op_bundle_sizes,
- DefaultValuedProperty<
- ArrayProperty<StringProperty, "operand bundle tags">,
- "ArrayRef<std::string>{}",
- "SmallVector<std::string>{}"
- >:$op_bundle_tags);
+ OptionalAttr<ArrayAttr>:$op_bundle_tags);
// Append the aliasing related attributes defined in LLVM_MemAccessOpBase.
let arguments = !con(args, aliasAttrs);
let results = (outs Optional<LLVM_Type>:$result);
@@ -1930,11 +1922,7 @@ def LLVM_CallIntrinsicOp
VariadicOfVariadic<LLVM_Type,
"op_bundle_sizes">:$op_bundle_operands,
DenseI32ArrayAttr:$op_bundle_sizes,
- DefaultValuedProperty<
- ArrayProperty<StringProperty, "operand bundle tags">,
- "ArrayRef<std::string>{}",
- "SmallVector<std::string>{}"
- >:$op_bundle_tags);
+ OptionalAttr<ArrayAttr>:$op_bundle_tags);
let results = (outs Optional<LLVM_Type>:$results);
let llvmBuilder = [{
return convertCallLLVMIntrinsicOp(op, builder, moduleTranslation);
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index c40ae4b..3695708 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -98,7 +98,7 @@ class ROCDL_IntrOp<string mnemonic, list<int> overloadedResults,
LLVM_IntrOpBase<ROCDL_Dialect, mnemonic,
"amdgcn_" # !subst(".", "_", mnemonic), overloadedResults,
overloadedOperands, traits, numResults, requiresAccessGroup,
- requiresAliasAnalysis, 0, immArgPositions, immArgAttrNames>;
+ requiresAliasAnalysis, 0, 0, immArgPositions, immArgAttrNames>;
//===----------------------------------------------------------------------===//
// ROCDL special register op definitions
diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
index 98b9151..0915bbd 100644
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -1055,13 +1055,13 @@ def PadOp : Op<Transform_Dialect, "structured.pad",
OpBuilder<(ins "Value":$target,
"ArrayRef<int64_t>":$paddingDimensions,
CArg<"ArrayRef<int64_t>", "{}">:$staticPadToMultipleOf,
- CArg<"ArrayRef<int64_t>", "{}">:$packPaddings,
+ CArg<"ArrayRef<int64_t>", "{}">:$nofoldFlags,
CArg<"ArrayRef<Attribute>", "{}">:$transposePaddings,
CArg<"StringRef", "::mlir::bufferization::MaterializeInDestinationOp::getOperationName()">:$copyBackOp)>,
OpBuilder<(ins "Value":$target,
"ArrayRef<int64_t>":$paddingDimensions,
"ArrayRef<OpFoldResult>":$mixedPadToMultipleOf,
- CArg<"ArrayRef<int64_t>", "{}">:$packPaddings,
+ CArg<"ArrayRef<int64_t>", "{}">:$nofoldFlags,
CArg<"ArrayRef<Attribute>", "{}">:$transposePaddings,
CArg<"StringRef", "::mlir::bufferization::MaterializeInDestinationOp::getOperationName()">:$copyBackOp)>
];
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index 96e0b3c..70b0866 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -297,7 +297,7 @@ struct LinalgPaddingOptions {
/// A flag for every operand to mark the PadOp as nofold which enables
/// packing for statically shaped operands.
SmallVector<bool> nofoldFlags;
- LinalgPaddingOptions &setPackPaddings(ArrayRef<bool> pp) {
+ LinalgPaddingOptions &setNofoldFlags(ArrayRef<bool> pp) {
nofoldFlags.assign(pp.begin(), pp.end());
return *this;
}
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
index 07402c8..3bb5ceb 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
@@ -1877,21 +1877,23 @@ def Tosa_RescaleOp: Tosa_Op<"rescale", [Pure,
let description = [{
Rescale quantized values into a new domain. Supported rescalings are:
- Mode Input Output
- signed 8 to 8 int8 int8
- signed 8 to 16 int8 int16
- signed 8 to 32 int8 int32
- signed 16 to 8 int16 int8
- signed 16 to 16 int16 int16
- signed 16 to 32 int16 int32
- signed 32 to 8 int32 int8
- signed 32 to 16 int32 int16
- signed 32 to 32 int32 int32
- signed 48 to 8 int48 int8
- signed 48 to 16 int48 int16
- signed 48 to 32 int48 int32
- unsigned 8 to signed 8 uint8 int8
- signed 8 to unsigned 8 int8 uint8
+
+ | Mode | Input | Output |
+ |------------------------|-------|--------|
+ | signed 8 to 8 | int8 | int8 |
+ | signed 8 to 16 | int8 | int16 |
+ | signed 8 to 32 | int8 | int32 |
+ | signed 16 to 8 | int16 | int8 |
+ | signed 16 to 16 | int16 | int16 |
+ | signed 16 to 32 | int16 | int32 |
+ | signed 32 to 8 | int32 | int8 |
+ | signed 32 to 16 | int32 | int16 |
+ | signed 32 to 32 | int32 | int32 |
+ | signed 48 to 8 | int48 | int8 |
+ | signed 48 to 16 | int48 | int16 |
+ | signed 48 to 32 | int48 | int32 |
+ | unsigned 8 to signed 8 | uint8 | int8 |
+ | signed 8 to unsigned 8 | int8 | uint8 |
}];
let arguments = (ins
diff --git a/mlir/include/mlir/Interfaces/InferTypeOpInterface.h b/mlir/include/mlir/Interfaces/InferTypeOpInterface.h
index 47bcfc9..4fcbeff 100644
--- a/mlir/include/mlir/Interfaces/InferTypeOpInterface.h
+++ b/mlir/include/mlir/Interfaces/InferTypeOpInterface.h
@@ -244,6 +244,10 @@ inferReturnTensorTypes(ArrayRef<ShapedTypeComponents> retComponents,
/// Verifies that the inferred result types match the actual result types for
/// the op. Precondition: op implements InferTypeOpInterface.
LogicalResult verifyInferredResultTypes(Operation *op);
+
+/// Report a fatal error indicating that the result types could not be
+/// inferred.
+void reportFatalInferReturnTypesError(OperationState &state);
} // namespace detail
namespace OpTrait {
diff --git a/mlir/include/mlir/Target/LLVMIR/ModuleImport.h b/mlir/include/mlir/Target/LLVMIR/ModuleImport.h
index 9f300bc..bbb7af5 100644
--- a/mlir/include/mlir/Target/LLVMIR/ModuleImport.h
+++ b/mlir/include/mlir/Target/LLVMIR/ModuleImport.h
@@ -243,6 +243,8 @@ public:
/// corresponding MLIR attribute names.
LogicalResult
convertIntrinsicArguments(ArrayRef<llvm::Value *> values,
+ ArrayRef<llvm::OperandBundleUse> opBundles,
+ bool requiresOpBundles,
ArrayRef<unsigned> immArgPositions,
ArrayRef<StringLiteral> immArgAttrNames,
SmallVectorImpl<Value> &valuesOut,
diff --git a/mlir/lib/Analysis/FlatLinearValueConstraints.cpp b/mlir/lib/Analysis/FlatLinearValueConstraints.cpp
index e628fb1..0d6ff2f 100644
--- a/mlir/lib/Analysis/FlatLinearValueConstraints.cpp
+++ b/mlir/lib/Analysis/FlatLinearValueConstraints.cpp
@@ -892,8 +892,8 @@ FlatLinearValueConstraints::FlatLinearValueConstraints(IntegerSet set,
set.getNumDims() + set.getNumSymbols() + 1,
set.getNumDims(), set.getNumSymbols(),
/*numLocals=*/0) {
- assert(operands.empty() ||
- set.getNumInputs() == operands.size() && "operand count mismatch");
+ assert((operands.empty() || set.getNumInputs() == operands.size()) &&
+ "operand count mismatch");
// Set the values for the non-local variables.
for (unsigned i = 0, e = operands.size(); i < e; ++i)
setValue(i, operands[i]);
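Because a string literal is never null, the old expression had the same truth value; the rewrite states the intended grouping and silences -Wparentheses. The pattern in isolation (a C++ aside, not from the patch):

    // assert(a || b && "msg");   parses as a || (b && "msg")  ==  a || b
    // assert((a || b) && "msg"); same truth value, explicit intent, no warning
    // assert(a && b || "msg");   the genuinely broken cousin: always true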
diff --git a/mlir/lib/Dialect/Arith/Transforms/EmulateNarrowType.cpp b/mlir/lib/Dialect/Arith/Transforms/EmulateNarrowType.cpp
index 4be0e06..fddd7c5 100644
--- a/mlir/lib/Dialect/Arith/Transforms/EmulateNarrowType.cpp
+++ b/mlir/lib/Dialect/Arith/Transforms/EmulateNarrowType.cpp
@@ -40,11 +40,11 @@ arith::NarrowTypeEmulationConverter::NarrowTypeEmulationConverter(
addConversion([this](FunctionType ty) -> std::optional<Type> {
SmallVector<Type> inputs;
if (failed(convertTypes(ty.getInputs(), inputs)))
- return std::nullopt;
+ return nullptr;
SmallVector<Type> results;
if (failed(convertTypes(ty.getResults(), results)))
- return std::nullopt;
+ return nullptr;
return FunctionType::get(ty.getContext(), inputs, results);
});
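The distinction is deliberate: in a TypeConverter callback returning std::optional<Type>, std::nullopt means "this callback declines; another registered conversion may still apply", while a null Type reports a hard conversion failure. A sketch of the convention (isInteresting and tryConvert are hypothetical helpers):

    addConversion([](FunctionType ty) -> std::optional<Type> {
      if (!isInteresting(ty))
        return std::nullopt; // decline; fall through to other conversions
      if (Type newTy = tryConvert(ty))
        return newTy;        // success
      return nullptr;        // failure: the type cannot be converted
    });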
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
index 875d8c4..1d009b0 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
@@ -224,6 +224,7 @@ struct OneShotBufferizePass
};
}
opt.printConflicts = printConflicts;
+ opt.bufferAlignment = bufferAlignment;
opt.testAnalysisOnly = testAnalysisOnly;
opt.bufferizeFunctionBoundaries = bufferizeFunctionBoundaries;
opt.checkParallelRegions = checkParallelRegions;
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
index 12ed8cc..cc73878 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
@@ -241,13 +241,18 @@ static void printOneOpBundle(OpAsmPrinter &p, OperandRange operands,
static void printOpBundles(OpAsmPrinter &p, Operation *op,
OperandRangeRange opBundleOperands,
TypeRangeRange opBundleOperandTypes,
- ArrayRef<std::string> opBundleTags) {
+ std::optional<ArrayAttr> opBundleTags) {
+ if (opBundleOperands.empty())
+ return;
+ assert(opBundleTags && "expect operand bundle tags");
+
p << "[";
llvm::interleaveComma(
- llvm::zip(opBundleOperands, opBundleOperandTypes, opBundleTags), p,
+ llvm::zip(opBundleOperands, opBundleOperandTypes, *opBundleTags), p,
[&p](auto bundle) {
+ auto bundleTag = cast<StringAttr>(std::get<2>(bundle)).getValue();
printOneOpBundle(p, std::get<0>(bundle), std::get<1>(bundle),
- std::get<2>(bundle));
+ bundleTag);
});
p << "]";
}
@@ -256,7 +261,7 @@ static ParseResult parseOneOpBundle(
OpAsmParser &p,
SmallVector<SmallVector<OpAsmParser::UnresolvedOperand>> &opBundleOperands,
SmallVector<SmallVector<Type>> &opBundleOperandTypes,
- SmallVector<std::string> &opBundleTags) {
+ SmallVector<Attribute> &opBundleTags) {
SMLoc currentParserLoc = p.getCurrentLocation();
SmallVector<OpAsmParser::UnresolvedOperand> operands;
SmallVector<Type> types;
@@ -276,7 +281,7 @@ static ParseResult parseOneOpBundle(
opBundleOperands.push_back(std::move(operands));
opBundleOperandTypes.push_back(std::move(types));
- opBundleTags.push_back(std::move(tag));
+ opBundleTags.push_back(StringAttr::get(p.getContext(), tag));
return success();
}
@@ -285,16 +290,17 @@ static std::optional<ParseResult> parseOpBundles(
OpAsmParser &p,
SmallVector<SmallVector<OpAsmParser::UnresolvedOperand>> &opBundleOperands,
SmallVector<SmallVector<Type>> &opBundleOperandTypes,
- SmallVector<std::string> &opBundleTags) {
+ ArrayAttr &opBundleTags) {
if (p.parseOptionalLSquare())
return std::nullopt;
if (succeeded(p.parseOptionalRSquare()))
return success();
+ SmallVector<Attribute> opBundleTagAttrs;
auto bundleParser = [&] {
return parseOneOpBundle(p, opBundleOperands, opBundleOperandTypes,
- opBundleTags);
+ opBundleTagAttrs);
};
if (p.parseCommaSeparatedList(bundleParser))
return failure();
@@ -302,6 +308,8 @@ static std::optional<ParseResult> parseOpBundles(
if (p.parseRSquare())
return failure();
+ opBundleTags = ArrayAttr::get(p.getContext(), opBundleTagAttrs);
+
return success();
}
@@ -1039,7 +1047,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state, TypeRange results,
/*CConv=*/nullptr, /*TailCallKind=*/nullptr,
/*memory_effects=*/nullptr,
/*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr,
- /*op_bundle_operands=*/{}, /*op_bundle_tags=*/std::nullopt,
+ /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{},
/*access_groups=*/nullptr, /*alias_scopes=*/nullptr,
/*noalias_scopes=*/nullptr, /*tbaa=*/nullptr);
}
@@ -1066,7 +1074,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state,
/*TailCallKind=*/nullptr, /*memory_effects=*/nullptr,
/*convergent=*/nullptr,
/*no_unwind=*/nullptr, /*will_return=*/nullptr,
- /*op_bundle_operands=*/{}, /*op_bundle_tags=*/std::nullopt,
+ /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{},
/*access_groups=*/nullptr,
/*alias_scopes=*/nullptr, /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr);
}
@@ -1079,7 +1087,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state,
/*fastmathFlags=*/nullptr, /*branch_weights=*/nullptr,
/*CConv=*/nullptr, /*TailCallKind=*/nullptr, /*memory_effects=*/nullptr,
/*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr,
- /*op_bundle_operands=*/{}, /*op_bundle_tags=*/std::nullopt,
+ /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{},
/*access_groups=*/nullptr, /*alias_scopes=*/nullptr,
/*noalias_scopes=*/nullptr, /*tbaa=*/nullptr);
}
@@ -1092,7 +1100,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state, LLVMFuncOp func,
/*fastmathFlags=*/nullptr, /*branch_weights=*/nullptr,
/*CConv=*/nullptr, /*TailCallKind=*/nullptr, /*memory_effects=*/nullptr,
/*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr,
- /*op_bundle_operands=*/{}, /*op_bundle_tags=*/std::nullopt,
+ /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{},
/*access_groups=*/nullptr, /*alias_scopes=*/nullptr,
/*noalias_scopes=*/nullptr, /*tbaa=*/nullptr);
}
@@ -1192,12 +1200,20 @@ LogicalResult verifyCallOpVarCalleeType(OpTy callOp) {
template <typename OpType>
static LogicalResult verifyOperandBundles(OpType &op) {
OperandRangeRange opBundleOperands = op.getOpBundleOperands();
- ArrayRef<std::string> opBundleTags = op.getOpBundleTags();
+ std::optional<ArrayAttr> opBundleTags = op.getOpBundleTags();
- if (opBundleTags.size() != opBundleOperands.size())
+ auto isStringAttr = [](Attribute tagAttr) {
+ return isa<StringAttr>(tagAttr);
+ };
+ if (opBundleTags && !llvm::all_of(*opBundleTags, isStringAttr))
+ return op.emitError("operand bundle tag must be a StringAttr");
+
+ size_t numOpBundles = opBundleOperands.size();
+ size_t numOpBundleTags = opBundleTags ? opBundleTags->size() : 0;
+ if (numOpBundles != numOpBundleTags)
return op.emitError("expected ")
- << opBundleOperands.size()
- << " operand bundle tags, but actually got " << opBundleTags.size();
+ << numOpBundles << " operand bundle tags, but actually got "
+ << numOpBundleTags;
return success();
}
@@ -1329,7 +1345,8 @@ void CallOp::print(OpAsmPrinter &p) {
{getCalleeAttrName(), getTailCallKindAttrName(),
getVarCalleeTypeAttrName(), getCConvAttrName(),
getOperandSegmentSizesAttrName(),
- getOpBundleSizesAttrName()});
+ getOpBundleSizesAttrName(),
+ getOpBundleTagsAttrName()});
p << " : ";
if (!isDirect)
@@ -1437,7 +1454,7 @@ ParseResult CallOp::parse(OpAsmParser &parser, OperationState &result) {
SmallVector<OpAsmParser::UnresolvedOperand> operands;
SmallVector<SmallVector<OpAsmParser::UnresolvedOperand>> opBundleOperands;
SmallVector<SmallVector<Type>> opBundleOperandTypes;
- SmallVector<std::string> opBundleTags;
+ ArrayAttr opBundleTags;
// Default to C Calling Convention if no keyword is provided.
result.addAttribute(
@@ -1483,9 +1500,9 @@ ParseResult CallOp::parse(OpAsmParser &parser, OperationState &result) {
parser, opBundleOperands, opBundleOperandTypes, opBundleTags);
result && failed(*result))
return failure();
- if (!opBundleTags.empty())
- result.getOrAddProperties<CallOp::Properties>().op_bundle_tags =
- std::move(opBundleTags);
+ if (opBundleTags && !opBundleTags.empty())
+ result.addAttribute(CallOp::getOpBundleTagsAttrName(result.name).getValue(),
+ opBundleTags);
if (parser.parseOptionalAttrDict(result.attributes))
return failure();
@@ -1525,8 +1542,7 @@ void InvokeOp::build(OpBuilder &builder, OperationState &state, LLVMFuncOp func,
auto calleeType = func.getFunctionType();
build(builder, state, getCallOpResultTypes(calleeType),
getCallOpVarCalleeType(calleeType), SymbolRefAttr::get(func), ops,
- normalOps, unwindOps, nullptr, nullptr, {}, std::nullopt, normal,
- unwind);
+ normalOps, unwindOps, nullptr, nullptr, {}, {}, normal, unwind);
}
void InvokeOp::build(OpBuilder &builder, OperationState &state, TypeRange tys,
@@ -1535,7 +1551,7 @@ void InvokeOp::build(OpBuilder &builder, OperationState &state, TypeRange tys,
ValueRange unwindOps) {
build(builder, state, tys,
/*var_callee_type=*/nullptr, callee, ops, normalOps, unwindOps, nullptr,
- nullptr, {}, std::nullopt, normal, unwind);
+ nullptr, {}, {}, normal, unwind);
}
void InvokeOp::build(OpBuilder &builder, OperationState &state,
@@ -1544,7 +1560,7 @@ void InvokeOp::build(OpBuilder &builder, OperationState &state,
Block *unwind, ValueRange unwindOps) {
build(builder, state, getCallOpResultTypes(calleeType),
getCallOpVarCalleeType(calleeType), callee, ops, normalOps, unwindOps,
- nullptr, nullptr, {}, std::nullopt, normal, unwind);
+ nullptr, nullptr, {}, {}, normal, unwind);
}
SuccessorOperands InvokeOp::getSuccessorOperands(unsigned index) {
@@ -1634,7 +1650,8 @@ void InvokeOp::print(OpAsmPrinter &p) {
p.printOptionalAttrDict((*this)->getAttrs(),
{getCalleeAttrName(), getOperandSegmentSizeAttr(),
getCConvAttrName(), getVarCalleeTypeAttrName(),
- getOpBundleSizesAttrName()});
+ getOpBundleSizesAttrName(),
+ getOpBundleTagsAttrName()});
p << " : ";
if (!isDirect)
@@ -1657,7 +1674,7 @@ ParseResult InvokeOp::parse(OpAsmParser &parser, OperationState &result) {
TypeAttr varCalleeType;
SmallVector<SmallVector<OpAsmParser::UnresolvedOperand>> opBundleOperands;
SmallVector<SmallVector<Type>> opBundleOperandTypes;
- SmallVector<std::string> opBundleTags;
+ ArrayAttr opBundleTags;
Block *normalDest, *unwindDest;
SmallVector<Value, 4> normalOperands, unwindOperands;
Builder &builder = parser.getBuilder();
@@ -1703,9 +1720,10 @@ ParseResult InvokeOp::parse(OpAsmParser &parser, OperationState &result) {
parser, opBundleOperands, opBundleOperandTypes, opBundleTags);
result && failed(*result))
return failure();
- if (!opBundleTags.empty())
- result.getOrAddProperties<InvokeOp::Properties>().op_bundle_tags =
- std::move(opBundleTags);
+ if (opBundleTags && !opBundleTags.empty())
+ result.addAttribute(
+ InvokeOp::getOpBundleTagsAttrName(result.name).getValue(),
+ opBundleTags);
if (parser.parseOptionalAttrDict(result.attributes))
return failure();
@@ -3333,7 +3351,7 @@ void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state,
mlir::StringAttr intrin, mlir::ValueRange args) {
build(builder, state, /*resultTypes=*/TypeRange{}, intrin, args,
FastmathFlagsAttr{},
- /*op_bundle_operands=*/{});
+ /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{});
}
void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state,
@@ -3341,14 +3359,14 @@ void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state,
mlir::LLVM::FastmathFlagsAttr fastMathFlags) {
build(builder, state, /*resultTypes=*/TypeRange{}, intrin, args,
fastMathFlags,
- /*op_bundle_operands=*/{});
+ /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{});
}
void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state,
mlir::Type resultType, mlir::StringAttr intrin,
mlir::ValueRange args) {
build(builder, state, {resultType}, intrin, args, FastmathFlagsAttr{},
- /*op_bundle_operands=*/{});
+ /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{});
}
void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state,
@@ -3356,7 +3374,7 @@ void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state,
mlir::StringAttr intrin, mlir::ValueRange args,
mlir::LLVM::FastmathFlagsAttr fastMathFlags) {
build(builder, state, resultTypes, intrin, args, fastMathFlags,
- /*op_bundle_operands=*/{});
+ /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{});
}
//===----------------------------------------------------------------------===//
@@ -3414,6 +3432,18 @@ void InlineAsmOp::getEffects(
}
//===----------------------------------------------------------------------===//
+// AssumeOp (intrinsic)
+//===----------------------------------------------------------------------===//
+
+void LLVM::AssumeOp::build(OpBuilder &builder, OperationState &state,
+ mlir::Value cond) {
+ return build(builder, state, cond, /*op_bundle_operands=*/{},
+ /*op_bundle_tags=*/{});
+}
+
+LogicalResult LLVM::AssumeOp::verify() { return verifyOperandBundles(*this); }
+
+//===----------------------------------------------------------------------===//
// masked_gather (intrinsic)
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 09c6b26..635273b 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -840,11 +840,11 @@ enum VectorMemoryAccessKind { ScalarBroadcast, Contiguous, Gather };
/// TODO: Statically shaped loops + vector masking
static uint64_t getTrailingNonUnitLoopDimIdx(LinalgOp linalgOp) {
SmallVector<int64_t> loopRanges = linalgOp.getStaticLoopRanges();
- assert(linalgOp.hasDynamicShape() ||
- llvm::count_if(loopRanges, [](int64_t dim) { return dim != 1; }) ==
- 1 &&
- "For statically shaped Linalg Ops, only one "
- "non-unit loop dim is expected");
+ assert(
+ (linalgOp.hasDynamicShape() ||
+ llvm::count_if(loopRanges, [](int64_t dim) { return dim != 1; }) == 1) &&
+ "For statically shaped Linalg Ops, only one "
+ "non-unit loop dim is expected");
size_t idx = loopRanges.size() - 1;
for (; idx >= 0; idx--)
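
Context for this hunk: `&&` binds tighter than `||`, so the old unparenthesized condition parsed as `hasDynamicShape || (count == 1 && "...")`. Because a string literal is always truthy, both parses happen to evaluate identically; the added parentheses state the intended grouping and silence `-Wparentheses`. A standalone C++ illustration of the pattern (values are hypothetical, not from this patch); the `createPadHighOp` hunk further down applies the same fix:

    #include <cassert>

    int main() {
      bool dynamicShape = true;
      int nonUnitDims = 2;
      // Without parentheses this would parse as
      //   dynamicShape || (nonUnitDims == 1 && "expected one non-unit dim")
      // which evaluates the same (the literal is truthy) but draws
      // -Wparentheses; the parenthesized form states the intent explicitly.
      assert((dynamicShape || nonUnitDims == 1) &&
             "expected one non-unit dim");
      return 0;
    }
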
diff --git a/mlir/lib/Dialect/MLProgram/Transforms/PipelineGlobalOps.cpp b/mlir/lib/Dialect/MLProgram/Transforms/PipelineGlobalOps.cpp
index 40c83487..27e89d6 100644
--- a/mlir/lib/Dialect/MLProgram/Transforms/PipelineGlobalOps.cpp
+++ b/mlir/lib/Dialect/MLProgram/Transforms/PipelineGlobalOps.cpp
@@ -148,8 +148,9 @@ void MLProgramPipelineGlobals::processBlock(
if (auto store = mlir::dyn_cast<GlobalStoreOp>(op)) {
auto ref = store.getGlobal();
symbolStore.insert(ref);
- if (previousStores.contains(ref)) {
- toDelete.push_back(previousStores.find(ref)->getSecond());
+ auto it = previousStores.find(ref);
+ if (it != previousStores.end()) {
+ toDelete.push_back(it->getSecond());
}
previousLoads[ref] = store.getValue();
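
The hunk above replaces a `contains` check followed by a second `find` with a single `find` whose iterator is reused. A standalone sketch of the same pattern using the standard library (names are illustrative, not from this patch):

    #include <iostream>
    #include <string>
    #include <unordered_map>
    #include <vector>

    // One lookup instead of contains() + find(): keep the iterator, reuse it.
    void recordStore(std::unordered_map<std::string, int> &previousStores,
                     std::vector<int> &toDelete, const std::string &ref,
                     int storeOp) {
      auto it = previousStores.find(ref);
      if (it != previousStores.end())
        toDelete.push_back(it->second); // the earlier store is now dead
      previousStores[ref] = storeOp;
    }

    int main() {
      std::unordered_map<std::string, int> previousStores{{"@g", 1}};
      std::vector<int> toDelete;
      recordStore(previousStores, toDelete, "@g", 2);
      std::cout << toDelete.size() << "\n"; // 1
      return 0;
    }
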
diff --git a/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp b/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp
index 9efea06..28f9061 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp
@@ -169,8 +169,9 @@ struct ConvertMemRefAllocation final : OpConversionPattern<OpTy> {
std::is_same<OpTy, memref::AllocaOp>(),
"expected only memref::AllocOp or memref::AllocaOp");
auto currentType = cast<MemRefType>(op.getMemref().getType());
- auto newResultType = dyn_cast<MemRefType>(
- this->getTypeConverter()->convertType(op.getType()));
+ auto newResultType =
+ this->getTypeConverter()->template convertType<MemRefType>(
+ op.getType());
if (!newResultType) {
return rewriter.notifyMatchFailure(
op->getLoc(),
@@ -378,7 +379,7 @@ struct ConvertMemRefReinterpretCast final
matchAndRewrite(memref::ReinterpretCastOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
MemRefType newTy =
- dyn_cast<MemRefType>(getTypeConverter()->convertType(op.getType()));
+ getTypeConverter()->convertType<MemRefType>(op.getType());
if (!newTy) {
return rewriter.notifyMatchFailure(
op->getLoc(),
@@ -466,8 +467,8 @@ struct ConvertMemRefSubview final : OpConversionPattern<memref::SubViewOp> {
LogicalResult
matchAndRewrite(memref::SubViewOp subViewOp, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
- MemRefType newTy = dyn_cast<MemRefType>(
- getTypeConverter()->convertType(subViewOp.getType()));
+ MemRefType newTy =
+ getTypeConverter()->convertType<MemRefType>(subViewOp.getType());
if (!newTy) {
return rewriter.notifyMatchFailure(
subViewOp->getLoc(),
@@ -632,14 +633,14 @@ void memref::populateMemRefNarrowTypeEmulationConversions(
SmallVector<int64_t> strides;
int64_t offset;
if (failed(getStridesAndOffset(ty, strides, offset)))
- return std::nullopt;
+ return nullptr;
if (!strides.empty() && strides.back() != 1)
- return std::nullopt;
+ return nullptr;
auto newElemTy = IntegerType::get(ty.getContext(), loadStoreWidth,
intTy.getSignedness());
if (!newElemTy)
- return std::nullopt;
+ return nullptr;
StridedLayoutAttr layoutAttr;
// If the offset is 0, we do not need a strided layout as the stride is
diff --git a/mlir/lib/Dialect/MemRef/Transforms/EmulateWideInt.cpp b/mlir/lib/Dialect/MemRef/Transforms/EmulateWideInt.cpp
index bc4535f..49b7162 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/EmulateWideInt.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/EmulateWideInt.cpp
@@ -159,7 +159,7 @@ void memref::populateMemRefWideIntEmulationConversions(
Type newElemTy = typeConverter.convertType(intTy);
if (!newElemTy)
- return std::nullopt;
+ return nullptr;
return ty.cloneWith(std::nullopt, newElemTy);
});
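
In MLIR's `TypeConverter`, a conversion callback that returns `std::nullopt` means "not applicable, try the remaining conversions", while returning a null `Type` marks the type as unconvertible. Switching these callbacks from `std::nullopt` to `nullptr` is what makes ops on unsupported types fail legalization outright, matching the new `expected-error` tests later in this patch. A standalone model of that three-way convention (not the real MLIR API; string "types" stand in for `Type`):

    #include <functional>
    #include <iostream>
    #include <optional>
    #include <string>
    #include <vector>

    using Conversion =
        std::function<std::optional<std::string>(const std::string &)>;

    std::string convertType(const std::vector<Conversion> &conversions,
                            const std::string &type) {
      for (const Conversion &c : conversions) {
        std::optional<std::string> result = c(type);
        if (!result)
          continue;           // std::nullopt: not applicable, try the next one
        if (result->empty())
          return "<illegal>"; // empty result plays the role of a null Type
        return *result;
      }
      return "<illegal>";
    }

    int main() {
      std::vector<Conversion> conversions = {
          [](const std::string &t) -> std::optional<std::string> {
            if (t != "i64")
              return std::nullopt;          // not ours, keep looking
            return std::string("vector<2xi32>");
          },
          [](const std::string &t) -> std::optional<std::string> {
            if (t == "i128")
              return std::string();         // unsupported: definitive failure
            return std::nullopt;
          }};
      std::cout << convertType(conversions, "i64") << "\n";  // vector<2xi32>
      std::cout << convertType(conversions, "i128") << "\n"; // <illegal>
      return 0;
    }
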
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
index c6c6edb..3217542 100644
--- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
+++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
@@ -2012,14 +2012,16 @@ void SimdOp::build(OpBuilder &builder, OperationState &state,
const SimdOperands &clauses) {
MLIRContext *ctx = builder.getContext();
// TODO Store clauses in op: linearVars, linearStepVars, privateVars,
- // privateSyms, reductionVars, reductionByref, reductionSyms.
+ // privateSyms.
SimdOp::build(builder, state, clauses.alignedVars,
makeArrayAttr(ctx, clauses.alignments), clauses.ifExpr,
/*linear_vars=*/{}, /*linear_step_vars=*/{},
clauses.nontemporalVars, clauses.order, clauses.orderMod,
/*private_vars=*/{}, /*private_syms=*/nullptr,
- /*reduction_vars=*/{}, /*reduction_byref=*/nullptr,
- /*reduction_syms=*/nullptr, clauses.safelen, clauses.simdlen);
+ clauses.reductionVars,
+ makeDenseBoolArrayAttr(ctx, clauses.reductionByref),
+ makeArrayAttr(ctx, clauses.reductionSyms), clauses.safelen,
+ clauses.simdlen);
}
LogicalResult SimdOp::verify() {
diff --git a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp
index e0b91f3..5c16e53 100644
--- a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp
@@ -27,9 +27,9 @@ PadOp mlir::tensor::createPadHighOp(RankedTensorType resType, Value source,
OpBuilder &b,
SmallVector<Value> dynOutDims) {
- assert((resType.getNumDynamicDims() == dynOutDims.size()) ||
- dynOutDims.empty() &&
- "Either none or all output dynamic dims must be specified!");
+ assert(((resType.getNumDynamicDims() == dynOutDims.size()) ||
+ dynOutDims.empty()) &&
+ "Either none or all output dynamic dims must be specified!");
// Init "low" and "high" padding values ("low" is kept as is, "high" is
// computed below).
diff --git a/mlir/lib/Interfaces/InferTypeOpInterface.cpp b/mlir/lib/Interfaces/InferTypeOpInterface.cpp
index e52d0e1..8cc4206 100644
--- a/mlir/lib/Interfaces/InferTypeOpInterface.cpp
+++ b/mlir/lib/Interfaces/InferTypeOpInterface.cpp
@@ -247,3 +247,17 @@ LogicalResult mlir::detail::verifyInferredResultTypes(Operation *op) {
return result;
}
+
+void mlir::detail::reportFatalInferReturnTypesError(OperationState &state) {
+ std::string buffer;
+ llvm::raw_string_ostream os(buffer);
+ os << "Failed to infer result type(s):\n";
+ os << "\"" << state.name << "\"(...) ";
+ os << state.attributes.getDictionary(state.location.getContext());
+ os << " : (";
+ llvm::interleaveComma(state.operands, os,
+ [&](Value val) { os << val.getType(); });
+ os << ") -> ( ??? )";
+ emitRemark(state.location, "location of op");
+ llvm::report_fatal_error(llvm::StringRef(buffer));
+}
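
The new helper assembles a pseudo-IR rendering of the un-inferrable op before aborting, so the fatal error shows the op name, attributes, and operand types. A standalone sketch of the message layout it produces (the op name and attribute dictionary are invented for illustration):

    #include <iostream>
    #include <sstream>
    #include <string>
    #include <vector>

    int main() {
      // Hypothetical op name, attribute dictionary, and operand types.
      std::string opName = "test.my_op";
      std::vector<std::string> operandTypes = {"i32", "f64"};

      std::ostringstream os;
      os << "Failed to infer result type(s):\n";
      os << "\"" << opName << "\"(...) ";
      os << "{attr = 42 : i32}";
      os << " : (";
      for (size_t i = 0; i < operandTypes.size(); ++i)
        os << (i ? ", " : "") << operandTypes[i]; // interleaveComma equivalent
      os << ") -> ( ??? )";
      std::cout << os.str() << "\n";
      return 0;
    }
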
diff --git a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp
index d034e57..4fd043c 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp
@@ -68,6 +68,12 @@ static LogicalResult convertIntrinsicImpl(OpBuilder &odsBuilder,
if (isConvertibleIntrinsic(intrinsicID)) {
SmallVector<llvm::Value *> args(inst->args());
ArrayRef<llvm::Value *> llvmOperands(args);
+
+ SmallVector<llvm::OperandBundleUse> llvmOpBundles;
+ llvmOpBundles.reserve(inst->getNumOperandBundles());
+ for (unsigned i = 0; i < inst->getNumOperandBundles(); ++i)
+ llvmOpBundles.push_back(inst->getOperandBundleAt(i));
+
#include "mlir/Dialect/LLVMIR/LLVMIntrinsicFromLLVMIRConversions.inc"
}
diff --git a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp
index a8595d1..2084e527 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp
@@ -114,17 +114,27 @@ convertOperandBundle(OperandRange bundleOperands, StringRef bundleTag,
}
static SmallVector<llvm::OperandBundleDef>
-convertOperandBundles(OperandRangeRange bundleOperands,
- ArrayRef<std::string> bundleTags,
+convertOperandBundles(OperandRangeRange bundleOperands, ArrayAttr bundleTags,
LLVM::ModuleTranslation &moduleTranslation) {
SmallVector<llvm::OperandBundleDef> bundles;
bundles.reserve(bundleOperands.size());
- for (auto [operands, tag] : llvm::zip_equal(bundleOperands, bundleTags))
+ for (auto [operands, tagAttr] : llvm::zip_equal(bundleOperands, bundleTags)) {
+ StringRef tag = cast<StringAttr>(tagAttr).getValue();
bundles.push_back(convertOperandBundle(operands, tag, moduleTranslation));
+ }
return bundles;
}
+static SmallVector<llvm::OperandBundleDef>
+convertOperandBundles(OperandRangeRange bundleOperands,
+ std::optional<ArrayAttr> bundleTags,
+ LLVM::ModuleTranslation &moduleTranslation) {
+ if (!bundleTags)
+ return {};
+ return convertOperandBundles(bundleOperands, *bundleTags, moduleTranslation);
+}
+
/// Builder for LLVM_CallIntrinsicOp
static LogicalResult
convertCallLLVMIntrinsicOp(CallIntrinsicOp op, llvm::IRBuilderBase &builder,
diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp
index bc830a7..2c0b665 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp
@@ -50,6 +50,12 @@ static LogicalResult convertIntrinsicImpl(OpBuilder &odsBuilder,
if (isConvertibleIntrinsic(intrinsicID)) {
SmallVector<llvm::Value *> args(inst->args());
ArrayRef<llvm::Value *> llvmOperands(args);
+
+ SmallVector<llvm::OperandBundleUse> llvmOpBundles;
+ llvmOpBundles.reserve(inst->getNumOperandBundles());
+ for (unsigned i = 0; i < inst->getNumOperandBundles(); ++i)
+ llvmOpBundles.push_back(inst->getOperandBundleAt(i));
+
#include "mlir/Dialect/LLVMIR/NVVMFromLLVMIRConversions.inc"
}
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 4a575f4..7c45e89 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -371,20 +371,46 @@ convertOmpCritical(Operation &opInst, llvm::IRBuilderBase &builder,
return success();
}
-/// Populates `reductions` with reduction declarations used in the given loop.
+/// Looks up the symbol `symbolName` starting at the operation `from` and
+/// returns the PrivateClauseOp it resolves to.
+static omp::PrivateClauseOp findPrivatizer(Operation *from,
+ SymbolRefAttr symbolName) {
+ omp::PrivateClauseOp privatizer =
+ SymbolTable::lookupNearestSymbolFrom<omp::PrivateClauseOp>(from,
+ symbolName);
+ assert(privatizer && "privatizer not found in the symbol table");
+ return privatizer;
+}
+
+/// Populates `privatizations` with privatization declarations used for the
+/// given op.
+/// TODO: generalise beyond ParallelOp
+static void collectPrivatizationDecls(
+ omp::ParallelOp op, SmallVectorImpl<omp::PrivateClauseOp> &privatizations) {
+ std::optional<ArrayAttr> attr = op.getPrivateSyms();
+ if (!attr)
+ return;
+
+ privatizations.reserve(privatizations.size() + attr->size());
+ for (auto symbolRef : attr->getAsRange<SymbolRefAttr>()) {
+ privatizations.push_back(findPrivatizer(op, symbolRef));
+ }
+}
+
+/// Populates `reductions` with reduction declarations used in the given op.
template <typename T>
static void
-collectReductionDecls(T loop,
+collectReductionDecls(T op,
SmallVectorImpl<omp::DeclareReductionOp> &reductions) {
- std::optional<ArrayAttr> attr = loop.getReductionSyms();
+ std::optional<ArrayAttr> attr = op.getReductionSyms();
if (!attr)
return;
- reductions.reserve(reductions.size() + loop.getNumReductionVars());
+ reductions.reserve(reductions.size() + op.getNumReductionVars());
for (auto symbolRef : attr->getAsRange<SymbolRefAttr>()) {
reductions.push_back(
SymbolTable::lookupNearestSymbolFrom<omp::DeclareReductionOp>(
- loop, symbolRef));
+ op, symbolRef));
}
}
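
The new `collectPrivatizationDecls` mirrors `collectReductionDecls`: an absent attribute means the clause is absent, and every symbol in the attribute must resolve. A standalone model of that lookup contract (a plain map stands in for the MLIR symbol table; names are illustrative):

    #include <cassert>
    #include <iostream>
    #include <map>
    #include <optional>
    #include <string>
    #include <vector>

    // Resolve an optional list of symbol names against a symbol table; an
    // absent list means the clause is absent, and every name must resolve.
    std::vector<int> collectDecls(
        const std::map<std::string, int> &symbolTable,
        const std::optional<std::vector<std::string>> &syms) {
      std::vector<int> decls;
      if (!syms)
        return decls;
      decls.reserve(syms->size());
      for (const std::string &name : *syms) {
        auto it = symbolTable.find(name);
        assert(it != symbolTable.end() &&
               "privatizer not found in symbol table");
        decls.push_back(it->second);
      }
      return decls;
    }

    int main() {
      std::map<std::string, int> table = {{"@priv.x", 1}, {"@red.add", 2}};
      std::optional<std::vector<std::string>> syms =
          std::vector<std::string>{"@priv.x"};
      for (int d : collectDecls(table, syms))
        std::cout << d << "\n"; // 1
      return 0;
    }
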
@@ -609,7 +635,7 @@ static LogicalResult
allocReductionVars(T loop, ArrayRef<BlockArgument> reductionArgs,
llvm::IRBuilderBase &builder,
LLVM::ModuleTranslation &moduleTranslation,
- llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
+ const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls,
SmallVectorImpl<llvm::Value *> &privateReductionVariables,
DenseMap<Value, llvm::Value *> &reductionVariableMap,
@@ -1317,76 +1343,11 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
privateReductionVariables, isByRef);
}
-/// A RAII class that on construction replaces the region arguments of the
-/// parallel op (which correspond to private variables) with the actual private
-/// variables they correspond to. This prepares the parallel op so that it
-/// matches what is expected by the OMPIRBuilder.
-///
-/// On destruction, it restores the original state of the operation so that on
-/// the MLIR side, the op is not affected by conversion to LLVM IR.
-class OmpParallelOpConversionManager {
-public:
- OmpParallelOpConversionManager(omp::ParallelOp opInst)
- : region(opInst.getRegion()),
- privateBlockArgs(cast<omp::BlockArgOpenMPOpInterface>(*opInst)
- .getPrivateBlockArgs()),
- privateVars(opInst.getPrivateVars()) {
- for (auto [blockArg, var] : llvm::zip_equal(privateBlockArgs, privateVars))
- mlir::replaceAllUsesInRegionWith(blockArg, var, region);
- }
-
- ~OmpParallelOpConversionManager() {
- for (auto [blockArg, var] : llvm::zip_equal(privateBlockArgs, privateVars))
- mlir::replaceAllUsesInRegionWith(var, blockArg, region);
- }
-
-private:
- Region &region;
- llvm::MutableArrayRef<BlockArgument> privateBlockArgs;
- OperandRange privateVars;
-};
-
-// Looks up from the operation from and returns the PrivateClauseOp with
-// name symbolName
-static omp::PrivateClauseOp findPrivatizer(Operation *from,
- SymbolRefAttr symbolName) {
- omp::PrivateClauseOp privatizer =
- SymbolTable::lookupNearestSymbolFrom<omp::PrivateClauseOp>(from,
- symbolName);
- assert(privatizer && "privatizer not found in the symbol table");
- return privatizer;
-}
-// clones the given privatizer. The original privatizer is used as
-// the insert point for the clone.
-static omp::PrivateClauseOp
-clonePrivatizer(LLVM::ModuleTranslation &moduleTranslation,
- omp::PrivateClauseOp privatizer, Operation *fromOperation) {
- MLIRContext &context = moduleTranslation.getContext();
- mlir::IRRewriter opCloner(&context);
- opCloner.setInsertionPoint(privatizer);
- auto clone =
- llvm::cast<mlir::omp::PrivateClauseOp>(opCloner.clone(*privatizer));
-
- // Unique the clone name to avoid clashes in the symbol table.
- unsigned counter = 0;
- SmallString<256> cloneName = SymbolTable::generateSymbolName<256>(
- privatizer.getSymName(),
- [&](llvm::StringRef candidate) {
- return SymbolTable::lookupNearestSymbolFrom(
- fromOperation, StringAttr::get(&context, candidate)) !=
- nullptr;
- },
- counter);
-
- clone.setSymName(cloneName);
- return clone;
-}
/// Converts the OpenMP parallel operation to LLVM IR.
static LogicalResult
convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
LLVM::ModuleTranslation &moduleTranslation) {
using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
- OmpParallelOpConversionManager raii(opInst);
ArrayRef<bool> isByRef = getIsByRef(opInst.getReductionByref());
assert(isByRef.size() == opInst.getNumReductionVars());
@@ -1395,6 +1356,15 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
LogicalResult bodyGenStatus = success();
llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
+ // Collect delayed privatization declarations
+ MutableArrayRef<BlockArgument> privateBlockArgs =
+ cast<omp::BlockArgOpenMPOpInterface>(*opInst).getPrivateBlockArgs();
+ SmallVector<llvm::Value *> llvmPrivateVars;
+ SmallVector<omp::PrivateClauseOp> privateDecls;
+ llvmPrivateVars.reserve(privateBlockArgs.size());
+ privateDecls.reserve(privateBlockArgs.size());
+ collectPrivatizationDecls(opInst, privateDecls);
+
// Collect reduction declarations
SmallVector<omp::DeclareReductionOp> reductionDecls;
collectReductionDecls(opInst, reductionDecls);
@@ -1403,6 +1373,66 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
SmallVector<DeferredStore> deferredStores;
auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) {
+ // Allocate private vars
+ llvm::BranchInst *allocaTerminator =
+ llvm::cast<llvm::BranchInst>(allocaIP.getBlock()->getTerminator());
+ builder.SetInsertPoint(allocaTerminator);
+ assert(allocaTerminator->getNumSuccessors() == 1 &&
+ "This is an unconditional branch created by OpenMPIRBuilder");
+ llvm::BasicBlock *afterAllocas = allocaTerminator->getSuccessor(0);
+
+ // FIXME: Some of the allocation regions do more than just allocating.
+ // They read from their block argument (amongst other non-alloca things).
+ // When OpenMPIRBuilder outlines the parallel region into a different
+ // function it places the loads for live in-values (such as these block
+ // arguments) at the end of the entry block (because the entry block is
+ // assumed to contain only allocas). Therefore, if we put these complicated
+    // alloc blocks in the entry block, their uses of the live-in values would
+    // not be dominated by the loads that define them. Fix this by adding a latealloc
+ // block after the entry block to put these in (this also helps to avoid
+ // mixing non-alloca code with allocas).
+ // Alloc regions which do not use the block argument can still be placed in
+ // the entry block (therefore keeping the allocas together).
+ llvm::BasicBlock *privAllocBlock = nullptr;
+ if (!privateBlockArgs.empty())
+ privAllocBlock = splitBB(builder, true, "omp.private.latealloc");
+ for (unsigned i = 0; i < privateBlockArgs.size(); ++i) {
+ Region &allocRegion = privateDecls[i].getAllocRegion();
+
+ // map allocation region block argument
+ llvm::Value *nonPrivateVar =
+ moduleTranslation.lookupValue(opInst.getPrivateVars()[i]);
+ assert(nonPrivateVar);
+ moduleTranslation.mapValue(privateDecls[i].getAllocMoldArg(),
+ nonPrivateVar);
+
+ // in-place convert the private allocation region
+ SmallVector<llvm::Value *, 1> phis;
+ if (privateDecls[i].getAllocMoldArg().getUses().empty()) {
+ // TODO this should use
+ // allocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca() so it goes before
+ // the code for fetching the thread id. Not doing this for now to avoid
+ // test churn.
+ builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
+ } else {
+ builder.SetInsertPoint(privAllocBlock->getTerminator());
+ }
+ if (failed(inlineConvertOmpRegions(allocRegion, "omp.private.alloc",
+ builder, moduleTranslation, &phis))) {
+ bodyGenStatus = failure();
+ return;
+ }
+ assert(phis.size() == 1 && "expected one allocation to be yielded");
+
+ moduleTranslation.mapValue(privateBlockArgs[i], phis[0]);
+ llvmPrivateVars.push_back(phis[0]);
+
+      // Clear the alloc region block argument mapping in case it needs to be
+      // re-created with a different source for another use of the same
+      // privatizer decl.
+ moduleTranslation.forgetMapping(allocRegion);
+ }
+
// Allocate reduction vars
DenseMap<Value, llvm::Value *> reductionVariableMap;
@@ -1419,12 +1449,64 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
deferredStores, isByRef)))
bodyGenStatus = failure();
+ // Apply copy region for firstprivate.
+ bool needsFirstprivate =
+ llvm::any_of(privateDecls, [](omp::PrivateClauseOp &privOp) {
+ return privOp.getDataSharingType() ==
+ omp::DataSharingClauseType::FirstPrivate;
+ });
+ if (needsFirstprivate) {
+ // Find the end of the allocation blocks
+ assert(afterAllocas->getSinglePredecessor());
+ builder.SetInsertPoint(
+ afterAllocas->getSinglePredecessor()->getTerminator());
+ llvm::BasicBlock *copyBlock =
+ splitBB(builder, /*CreateBranch=*/true, "omp.private.copy");
+ builder.SetInsertPoint(copyBlock->getFirstNonPHIOrDbgOrAlloca());
+ }
+ for (unsigned i = 0; i < privateBlockArgs.size(); ++i) {
+ if (privateDecls[i].getDataSharingType() !=
+ omp::DataSharingClauseType::FirstPrivate)
+ continue;
+
+ // copyRegion implements `lhs = rhs`
+ Region &copyRegion = privateDecls[i].getCopyRegion();
+
+ // map copyRegion rhs arg
+ llvm::Value *nonPrivateVar =
+ moduleTranslation.lookupValue(opInst.getPrivateVars()[i]);
+ assert(nonPrivateVar);
+ moduleTranslation.mapValue(privateDecls[i].getCopyMoldArg(),
+ nonPrivateVar);
+
+ // map copyRegion lhs arg
+ moduleTranslation.mapValue(privateDecls[i].getCopyPrivateArg(),
+ llvmPrivateVars[i]);
+
+ // in-place convert copy region
+ builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator());
+ if (failed(inlineConvertOmpRegions(copyRegion, "omp.private.copy",
+ builder, moduleTranslation))) {
+ bodyGenStatus = failure();
+ return;
+ }
+
+ // ignore unused value yielded from copy region
+
+      // Clear the copy region block argument mapping in case it needs to be
+      // re-created with different sources for reuse of the same privatizer
+      // decl.
+ moduleTranslation.forgetMapping(copyRegion);
+ }
+
// Initialize reduction vars
- builder.restoreIP(allocaIP);
+ builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator());
llvm::BasicBlock *initBlock = splitBB(builder, true, "omp.reduction.init");
allocaIP =
InsertPointTy(allocaIP.getBlock(),
allocaIP.getBlock()->getTerminator()->getIterator());
+
+ builder.restoreIP(allocaIP);
SmallVector<llvm::Value *> byRefVars(opInst.getNumReductionVars());
for (unsigned i = 0; i < opInst.getNumReductionVars(); ++i) {
if (isByRef[i]) {
@@ -1534,183 +1616,11 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
}
};
- SmallVector<omp::PrivateClauseOp> mlirPrivatizerClones;
- SmallVector<llvm::Value *> llvmPrivateVars;
-
- // TODO: Perform appropriate actions according to the data-sharing
- // attribute (shared, private, firstprivate, ...) of variables.
- // Currently shared and private are supported.
- auto privCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP,
- llvm::Value &, llvm::Value &llvmOmpRegionInput,
- llvm::Value *&llvmReplacementValue) -> InsertPointTy {
- llvmReplacementValue = &llvmOmpRegionInput;
-
- // If this is a private value, this lambda will return the corresponding
- // mlir value and its `PrivateClauseOp`. Otherwise, empty values are
- // returned.
- auto [mlirPrivVar, mlirPrivatizerClone] =
- [&]() -> std::pair<mlir::Value, omp::PrivateClauseOp> {
- if (!opInst.getPrivateVars().empty()) {
- auto mlirPrivVars = opInst.getPrivateVars();
- auto mlirPrivSyms = opInst.getPrivateSyms();
-
- // Try to find a privatizer that corresponds to the LLVM value being
- // privatized.
- for (auto [mlirPrivVar, mlirPrivatizerAttr] :
- llvm::zip_equal(mlirPrivVars, *mlirPrivSyms)) {
- // Find the MLIR private variable corresponding to the LLVM value
- // being privatized.
- llvm::Value *mlirToLLVMPrivVar =
- moduleTranslation.lookupValue(mlirPrivVar);
-
- // Check if the LLVM value being privatized matches the LLVM value
- // mapped to privVar. In some cases, this is not trivial ...
- auto isMatch = [&]() {
- if (mlirToLLVMPrivVar == nullptr)
- return false;
-
- // If both values are trivially equal, we found a match.
- if (mlirToLLVMPrivVar == &llvmOmpRegionInput)
- return true;
-
- // Otherwise, we check if both llvmOmpRegionInputPtr and
- // mlirToLLVMPrivVar refer to the same memory (through a load/store
- // pair). This happens if a struct (i.e. multi-field value) is being
- // privatized.
- //
- // For example, if the privatized value is defined by:
- // ```
- // %priv_val = alloca { ptr, i64 }, align 8
- // ```
- //
- // The initialization of this value (outside the omp region) will be
- // something like this:
- //
- // clang-format off
- // ```
- // %partially_init_priv_val = insertvalue { ptr, i64 } undef,
- // ptr %some_ptr, 0
- // %fully_init_priv_val = insertvalue { ptr, i64 } %partially_init_priv_val,
- // i64 %some_i64, 1
- // ...
- // store { ptr, i64 } %fully_init_priv_val, ptr %priv_val, align 8
- // ```
- // clang-format on
- //
- // Now, we enter the OMP region, in order to access this privatized
- // value, we need to load from the allocated memory:
- // ```
- // omp.par.entry:
- // %priv_val_load = load { ptr, i64 }, ptr %priv_val, align 8
- // ```
- //
- // The 2 LLVM values tracked here map as follows:
- // - `mlirToLLVMPrivVar` -> `%fully_init_priv_val`
- // - `llvmOmpRegionInputPtr` -> `%priv_val_load`
- //
- // Even though they eventually refer to the same memory reference
- // (the memory being privatized), they are 2 distinct LLVM values.
- // Therefore, we need to discover their correspondence by finding
- // out if they store into and load from the same mem ref.
- auto *llvmOmpRegionInputPtrLoad =
- llvm::dyn_cast_if_present<llvm::LoadInst>(&llvmOmpRegionInput);
-
- if (llvmOmpRegionInputPtrLoad == nullptr)
- return false;
-
- for (auto &use : mlirToLLVMPrivVar->uses()) {
- auto *mlirToLLVMPrivVarStore =
- llvm::dyn_cast_if_present<llvm::StoreInst>(use.getUser());
- if (mlirToLLVMPrivVarStore &&
- (llvmOmpRegionInputPtrLoad->getPointerOperand() ==
- mlirToLLVMPrivVarStore->getPointerOperand()))
- return true;
- }
-
- return false;
- };
-
- if (!isMatch())
- continue;
-
- SymbolRefAttr privSym = llvm::cast<SymbolRefAttr>(mlirPrivatizerAttr);
- omp::PrivateClauseOp privatizer = findPrivatizer(opInst, privSym);
-
- // Clone the privatizer in case it is used by more than one parallel
- // region. The privatizer is processed in-place (see below) before it
- // gets inlined in the parallel region and therefore processing the
- // original op is dangerous.
- return {mlirPrivVar,
- clonePrivatizer(moduleTranslation, privatizer, opInst)};
- }
- }
-
- return {mlir::Value(), omp::PrivateClauseOp()};
- }();
-
- if (mlirPrivVar) {
- Region &allocRegion = mlirPrivatizerClone.getAllocRegion();
-
- // If this is a `firstprivate` clause, prepare the `omp.private` op by:
- if (mlirPrivatizerClone.getDataSharingType() ==
- omp::DataSharingClauseType::FirstPrivate) {
- auto oldAllocBackBlock = std::prev(allocRegion.end());
- omp::YieldOp oldAllocYieldOp =
- llvm::cast<omp::YieldOp>(oldAllocBackBlock->getTerminator());
-
- Region &copyRegion = mlirPrivatizerClone.getCopyRegion();
-
- mlir::IRRewriter copyCloneBuilder(&moduleTranslation.getContext());
- // 1. Cloning the `copy` region to the end of the `alloc` region.
- copyCloneBuilder.cloneRegionBefore(copyRegion, allocRegion,
- allocRegion.end());
-
- auto newCopyRegionFrontBlock = std::next(oldAllocBackBlock);
- // 2. Merging the last `alloc` block with the first block in the `copy`
- // region clone.
- // 3. Re-mapping the first argument of the `copy` region to be the
- // argument of the `alloc` region and the second argument of the `copy`
- // region to be the yielded value of the `alloc` region (this is the
- // private clone of the privatized value).
- copyCloneBuilder.mergeBlocks(&*newCopyRegionFrontBlock,
- &*oldAllocBackBlock,
- {mlirPrivatizerClone.getAllocMoldArg(),
- oldAllocYieldOp.getOperand(0)});
-
- // 4. The old terminator of the `alloc` region is not needed anymore, so
- // delete it.
- oldAllocYieldOp.erase();
- }
-
- // Replace the privatizer block argument with mlir value being privatized.
- // This way, the body of the privatizer will be changed from using the
- // region/block argument to the value being privatized.
- replaceAllUsesInRegionWith(mlirPrivatizerClone.getAllocMoldArg(),
- mlirPrivVar, allocRegion);
-
- auto oldIP = builder.saveIP();
- builder.restoreIP(allocaIP);
-
- SmallVector<llvm::Value *, 1> yieldedValues;
- if (failed(inlineConvertOmpRegions(allocRegion, "omp.privatizer", builder,
- moduleTranslation, &yieldedValues))) {
- opInst.emitError("failed to inline `alloc` region of an `omp.private` "
- "op in the parallel region");
- bodyGenStatus = failure();
- mlirPrivatizerClone.erase();
- } else {
- assert(yieldedValues.size() == 1);
- llvmReplacementValue = yieldedValues.front();
-
- // Keep the LLVM replacement value and the op clone in case we need to
- // emit cleanup (i.e. deallocation) logic.
- llvmPrivateVars.push_back(llvmReplacementValue);
- mlirPrivatizerClones.push_back(mlirPrivatizerClone);
- }
-
- builder.restoreIP(oldIP);
- }
-
+ auto privCB = [](InsertPointTy allocaIP, InsertPointTy codeGenIP,
+ llvm::Value &, llvm::Value &val, llvm::Value *&replVal) {
+    // Tell OpenMPIRBuilder not to do anything; privatization was already
+    // handled in bodyGenCB.
+ replVal = &val;
return codeGenIP;
};
@@ -1733,8 +1643,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
bodyGenStatus = failure();
SmallVector<Region *> privateCleanupRegions;
- llvm::transform(mlirPrivatizerClones,
- std::back_inserter(privateCleanupRegions),
+ llvm::transform(privateDecls, std::back_inserter(privateCleanupRegions),
[](omp::PrivateClauseOp privatizer) {
return &privatizer.getDeallocRegion();
});
@@ -1767,9 +1676,6 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
ompBuilder->createParallel(ompLoc, allocaIP, bodyGenCB, privCB, finiCB,
ifCond, numThreads, pbKind, isCancellable));
- for (mlir::omp::PrivateClauseOp privatizerClone : mlirPrivatizerClones)
- privatizerClone.erase();
-
return bodyGenStatus;
}
@@ -1785,6 +1691,20 @@ convertOrderKind(std::optional<omp::ClauseOrderKind> o) {
llvm_unreachable("Unknown ClauseOrderKind kind");
}
+static LogicalResult simdOpSupported(omp::SimdOp op) {
+ if (!op.getLinearVars().empty() || !op.getLinearStepVars().empty())
+ return op.emitError("linear clause not yet supported");
+
+ if (!op.getPrivateVars().empty() || op.getPrivateSyms())
+ return op.emitError("privatization clauses not yet supported");
+
+ if (!op.getReductionVars().empty() || op.getReductionByref() ||
+ op.getReductionSyms())
+ return op.emitError("reduction clause not yet supported");
+
+ return success();
+}
+
/// Converts an OpenMP simd loop into LLVM IR using OpenMPIRBuilder.
static LogicalResult
convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder,
@@ -1792,11 +1712,8 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder,
auto simdOp = cast<omp::SimdOp>(opInst);
auto loopOp = cast<omp::LoopNestOp>(simdOp.getWrappedLoop());
- if (!simdOp.getLinearVars().empty() || !simdOp.getLinearStepVars().empty() ||
- !simdOp.getPrivateVars().empty() || simdOp.getPrivateSyms() ||
- !simdOp.getReductionVars().empty() || simdOp.getReductionByref() ||
- simdOp.getReductionSyms())
- return opInst.emitError("unhandled clauses for translation to LLVM IR");
+ if (failed(simdOpSupported(simdOp)))
+ return failure();
llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
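
The refactor above trades one catch-all "unhandled clauses" error for per-clause messages behind a named predicate, so users learn which clause blocked translation. A standalone sketch of the same shape (the struct and its fields are invented for illustration):

    #include <iostream>
    #include <string>
    #include <vector>

    struct SimdOp {
      std::vector<int> linearVars, privateVars, reductionVars;
    };

    // Returns true if translatable; otherwise fills `err` with a
    // clause-specific message instead of one generic diagnostic.
    bool simdOpSupported(const SimdOp &op, std::string &err) {
      if (!op.linearVars.empty()) {
        err = "linear clause not yet supported";
        return false;
      }
      if (!op.privateVars.empty()) {
        err = "privatization clauses not yet supported";
        return false;
      }
      if (!op.reductionVars.empty()) {
        err = "reduction clause not yet supported";
        return false;
      }
      return true;
    }

    int main() {
      std::string err;
      if (!simdOpSupported(SimdOp{{}, {7}, {}}, err))
        std::cout << err << "\n"; // privatization clauses not yet supported
      return 0;
    }
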
diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp
index bd861f3..6e97b2a 100644
--- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp
@@ -1311,7 +1311,8 @@ ModuleImport::convertValues(ArrayRef<llvm::Value *> values) {
}
LogicalResult ModuleImport::convertIntrinsicArguments(
- ArrayRef<llvm::Value *> values, ArrayRef<unsigned> immArgPositions,
+ ArrayRef<llvm::Value *> values, ArrayRef<llvm::OperandBundleUse> opBundles,
+ bool requiresOpBundles, ArrayRef<unsigned> immArgPositions,
ArrayRef<StringLiteral> immArgAttrNames, SmallVectorImpl<Value> &valuesOut,
SmallVectorImpl<NamedAttribute> &attrsOut) {
assert(immArgPositions.size() == immArgAttrNames.size() &&
@@ -1341,6 +1342,35 @@ LogicalResult ModuleImport::convertIntrinsicArguments(
valuesOut.push_back(*mlirValue);
}
+ SmallVector<int> opBundleSizes;
+ SmallVector<Attribute> opBundleTagAttrs;
+ if (requiresOpBundles) {
+ opBundleSizes.reserve(opBundles.size());
+ opBundleTagAttrs.reserve(opBundles.size());
+
+ for (const llvm::OperandBundleUse &bundle : opBundles) {
+ opBundleSizes.push_back(bundle.Inputs.size());
+ opBundleTagAttrs.push_back(StringAttr::get(context, bundle.getTagName()));
+
+ for (const llvm::Use &opBundleOperand : bundle.Inputs) {
+ auto operandMlirValue = convertValue(opBundleOperand.get());
+ if (failed(operandMlirValue))
+ return failure();
+ valuesOut.push_back(*operandMlirValue);
+ }
+ }
+
+ auto opBundleSizesAttr = DenseI32ArrayAttr::get(context, opBundleSizes);
+ auto opBundleSizesAttrNameAttr =
+ StringAttr::get(context, LLVMDialect::getOpBundleSizesAttrName());
+ attrsOut.push_back({opBundleSizesAttrNameAttr, opBundleSizesAttr});
+
+ auto opBundleTagsAttr = ArrayAttr::get(context, opBundleTagAttrs);
+ auto opBundleTagsAttrNameAttr =
+ StringAttr::get(context, LLVMDialect::getOpBundleTagsAttrName());
+ attrsOut.push_back({opBundleTagsAttrNameAttr, opBundleTagsAttr});
+ }
+
return success();
}
diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
index 6e005f9..ceb8ba3 100644
--- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
@@ -55,6 +55,7 @@
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <numeric>
#include <optional>
#define DEBUG_TYPE "llvm-dialect-to-llvm-ir"
@@ -854,8 +855,40 @@ llvm::CallInst *mlir::LLVM::detail::createIntrinsicCall(
"LLVM `immArgPositions` and MLIR `immArgAttrNames` should have equal "
"length");
+ SmallVector<llvm::OperandBundleDef> opBundles;
+ size_t numOpBundleOperands = 0;
+ auto opBundleSizesAttr = cast_if_present<DenseI32ArrayAttr>(
+ intrOp->getAttr(LLVMDialect::getOpBundleSizesAttrName()));
+ auto opBundleTagsAttr = cast_if_present<ArrayAttr>(
+ intrOp->getAttr(LLVMDialect::getOpBundleTagsAttrName()));
+
+ if (opBundleSizesAttr && opBundleTagsAttr) {
+ ArrayRef<int> opBundleSizes = opBundleSizesAttr.asArrayRef();
+ assert(opBundleSizes.size() == opBundleTagsAttr.size() &&
+ "operand bundles and tags do not match");
+
+ numOpBundleOperands =
+ std::accumulate(opBundleSizes.begin(), opBundleSizes.end(), size_t(0));
+ assert(numOpBundleOperands <= intrOp->getNumOperands() &&
+ "operand bundle operands is more than the number of operands");
+
+ ValueRange operands = intrOp->getOperands().take_back(numOpBundleOperands);
+ size_t nextOperandIdx = 0;
+ opBundles.reserve(opBundleSizesAttr.size());
+
+ for (auto [opBundleTagAttr, bundleSize] :
+ llvm::zip(opBundleTagsAttr, opBundleSizes)) {
+ auto bundleTag = cast<StringAttr>(opBundleTagAttr).str();
+ auto bundleOperands = moduleTranslation.lookupValues(
+ operands.slice(nextOperandIdx, bundleSize));
+ opBundles.emplace_back(std::move(bundleTag), std::move(bundleOperands));
+ nextOperandIdx += bundleSize;
+ }
+ }
+
// Map operands and attributes to LLVM values.
- auto operands = moduleTranslation.lookupValues(intrOp->getOperands());
+ auto opOperands = intrOp->getOperands().drop_back(numOpBundleOperands);
+ auto operands = moduleTranslation.lookupValues(opOperands);
SmallVector<llvm::Value *> args(immArgPositions.size() + operands.size());
for (auto [immArgPos, immArgName] :
llvm::zip(immArgPositions, immArgAttrNames)) {
@@ -890,7 +923,7 @@ llvm::CallInst *mlir::LLVM::detail::createIntrinsicCall(
llvm::Function *llvmIntr = llvm::Intrinsic::getOrInsertDeclaration(
module, intrinsic, overloadedTypes);
- return builder.CreateCall(llvmIntr, args);
+ return builder.CreateCall(llvmIntr, args, opBundles);
}
/// Given a single MLIR operation, create the corresponding LLVM IR operation
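
On the export side, bundle operands live flattened at the back of the intrinsic op's operand list, with `op_bundle_sizes` recording how to re-slice them; `drop_back`/`take_back` then split regular operands from bundle operands. A standalone model of the slicing arithmetic (plain vectors stand in for MLIR operand ranges; the values are hypothetical):

    #include <cassert>
    #include <iostream>
    #include <numeric>
    #include <string>
    #include <vector>

    int main() {
      // Flattened operand list: regular operands first, bundle operands last.
      std::vector<std::string> operands = {"%cond", "%p", "%align", "%x"};
      std::vector<int> bundleSizes = {2, 1}; // "align"(%p, %align), "tag"(%x)

      size_t numBundleOperands =
          std::accumulate(bundleSizes.begin(), bundleSizes.end(), size_t(0));
      assert(numBundleOperands <= operands.size() &&
             "operand bundle operands must not exceed the operand count");

      // Everything before `next` is a regular operand (drop_back); the rest
      // is re-sliced per bundle (take_back + slice).
      size_t next = operands.size() - numBundleOperands;
      for (int size : bundleSizes) {
        std::cout << "bundle:";
        for (int i = 0; i < size; ++i)
          std::cout << " " << operands[next++];
        std::cout << "\n";
      }
      return 0;
    }
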
diff --git a/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir b/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir
index b861034..55b1bc9 100644
--- a/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir
+++ b/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir
@@ -684,7 +684,7 @@ func.func @collapse_static_shape_with_non_identity_layout(%arg: memref<1x1x8x8xf
// CHECK: %[[INT_TO_PTR:.*]] = llvm.ptrtoint %[[BUFF_ADDR]] : !llvm.ptr to i64
// CHECK: %[[AND:.*]] = llvm.and %[[INT_TO_PTR]], {{.*}} : i64
// CHECK: %[[CMP:.*]] = llvm.icmp "eq" %[[AND]], {{.*}} : i64
-// CHECK: "llvm.intr.assume"(%[[CMP]]) : (i1) -> ()
+// CHECK: llvm.intr.assume %[[CMP]] : i1
// CHECK: %[[LD_ADDR:.*]] = llvm.getelementptr %[[BUFF_ADDR]][%{{.*}}] : (!llvm.ptr, i64) -> !llvm.ptr, f32
// CHECK: %[[VAL:.*]] = llvm.load %[[LD_ADDR]] : !llvm.ptr -> f32
// CHECK: return %[[VAL]] : f32
diff --git a/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir b/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir
index 9dc22ab..48dc907 100644
--- a/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir
+++ b/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir
@@ -160,7 +160,7 @@ func.func @assume_alignment(%0 : memref<4x4xf16>) {
// CHECK-NEXT: %[[INT:.*]] = llvm.ptrtoint %[[PTR]] : !llvm.ptr to i64
// CHECK-NEXT: %[[MASKED_PTR:.*]] = llvm.and %[[INT]], %[[MASK:.*]] : i64
// CHECK-NEXT: %[[CONDITION:.*]] = llvm.icmp "eq" %[[MASKED_PTR]], %[[ZERO]] : i64
- // CHECK-NEXT: "llvm.intr.assume"(%[[CONDITION]]) : (i1) -> ()
+ // CHECK-NEXT: llvm.intr.assume %[[CONDITION]] : i1
memref.assume_alignment %0, 16 : memref<4x4xf16>
return
}
@@ -177,7 +177,7 @@ func.func @assume_alignment_w_offset(%0 : memref<4x4xf16, strided<[?, ?], offset
// CHECK-NEXT: %[[INT:.*]] = llvm.ptrtoint %[[BUFF_ADDR]] : !llvm.ptr to i64
// CHECK-NEXT: %[[MASKED_PTR:.*]] = llvm.and %[[INT]], %[[MASK:.*]] : i64
// CHECK-NEXT: %[[CONDITION:.*]] = llvm.icmp "eq" %[[MASKED_PTR]], %[[ZERO]] : i64
- // CHECK-NEXT: "llvm.intr.assume"(%[[CONDITION]]) : (i1) -> ()
+ // CHECK-NEXT: llvm.intr.assume %[[CONDITION]] : i1
memref.assume_alignment %0, 16 : memref<4x4xf16, strided<[?, ?], offset: ?>>
return
}
diff --git a/mlir/test/Dialect/LLVMIR/inlining.mlir b/mlir/test/Dialect/LLVMIR/inlining.mlir
index f9551e3..0b7ca3f 100644
--- a/mlir/test/Dialect/LLVMIR/inlining.mlir
+++ b/mlir/test/Dialect/LLVMIR/inlining.mlir
@@ -18,7 +18,7 @@ func.func @inner_func_inlinable(%ptr : !llvm.ptr) -> i32 {
"llvm.intr.memset"(%ptr, %byte, %0) <{isVolatile = true}> : (!llvm.ptr, i8, i32) -> ()
"llvm.intr.memmove"(%ptr, %ptr, %0) <{isVolatile = true}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
"llvm.intr.memcpy"(%ptr, %ptr, %0) <{isVolatile = true}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
- "llvm.intr.assume"(%true) : (i1) -> ()
+ llvm.intr.assume %true : i1
llvm.fence release
%2 = llvm.atomicrmw add %ptr, %0 monotonic : !llvm.ptr, i32
%3 = llvm.cmpxchg %ptr, %0, %1 acq_rel monotonic : !llvm.ptr, i32
@@ -44,7 +44,7 @@ func.func @inner_func_inlinable(%ptr : !llvm.ptr) -> i32 {
// CHECK: "llvm.intr.memset"(%[[PTR]]
// CHECK: "llvm.intr.memmove"(%[[PTR]], %[[PTR]]
// CHECK: "llvm.intr.memcpy"(%[[PTR]], %[[PTR]]
-// CHECK: "llvm.intr.assume"
+// CHECK: llvm.intr.assume
// CHECK: llvm.fence release
// CHECK: llvm.atomicrmw add %[[PTR]], %[[CST]] monotonic
// CHECK: llvm.cmpxchg %[[PTR]], %[[CST]], %[[RES]] acq_rel monotonic
diff --git a/mlir/test/Dialect/LLVMIR/roundtrip.mlir b/mlir/test/Dialect/LLVMIR/roundtrip.mlir
index 3062cdc..b8ce7db 100644
--- a/mlir/test/Dialect/LLVMIR/roundtrip.mlir
+++ b/mlir/test/Dialect/LLVMIR/roundtrip.mlir
@@ -836,3 +836,30 @@ llvm.func @test_call_intrin_with_opbundle(%arg0 : !llvm.ptr) {
llvm.call_intrinsic "llvm.assume"(%0) ["align"(%arg0, %1 : !llvm.ptr, i32)] : (i1) -> ()
llvm.return
}
+
+// CHECK-LABEL: @test_assume_intr_no_opbundle
+llvm.func @test_assume_intr_no_opbundle(%arg0 : !llvm.ptr) {
+ %0 = llvm.mlir.constant(1 : i1) : i1
+ // CHECK: llvm.intr.assume %0 : i1
+ llvm.intr.assume %0 : i1
+ llvm.return
+}
+
+// CHECK-LABEL: @test_assume_intr_empty_opbundle
+llvm.func @test_assume_intr_empty_opbundle(%arg0 : !llvm.ptr) {
+ %0 = llvm.mlir.constant(1 : i1) : i1
+ // CHECK: llvm.intr.assume %0 : i1
+ llvm.intr.assume %0 [] : i1
+ llvm.return
+}
+
+// CHECK-LABEL: @test_assume_intr_with_opbundles
+llvm.func @test_assume_intr_with_opbundles(%arg0 : !llvm.ptr) {
+ %0 = llvm.mlir.constant(1 : i1) : i1
+ %1 = llvm.mlir.constant(2 : i32) : i32
+ %2 = llvm.mlir.constant(3 : i32) : i32
+ %3 = llvm.mlir.constant(4 : i32) : i32
+ // CHECK: llvm.intr.assume %0 ["tag1"(%1, %2 : i32, i32), "tag2"(%3 : i32)] : i1
+ llvm.intr.assume %0 ["tag1"(%1, %2 : i32, i32), "tag2"(%3 : i32)] : i1
+ llvm.return
+}
diff --git a/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir b/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir
index 540da23..1d6cbfa 100644
--- a/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir
+++ b/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir
@@ -203,7 +203,6 @@ func.func @memref_subview_dynamic_offset_i4(%idx : index) -> i4 {
// -----
-
func.func @negative_memref_subview_non_contiguous(%idx : index) -> i4 {
%c0 = arith.constant 0 : index
%arr = memref.alloc() : memref<40x40xi4>
@@ -543,13 +542,15 @@ func.func @memref_copy_i4(%arg0: memref<32x128xi4, 1>, %arg1: memref<32x128xi4>)
// -----
-!colMajor = memref<8x8xi4, strided<[1, 8]>>
-func.func @copy_distinct_layouts(%idx : index) -> i4 {
- %c0 = arith.constant 0 : index
- %arr = memref.alloc() : memref<8x8xi4>
- %arr2 = memref.alloc() : !colMajor
- // expected-error @+1 {{failed to legalize operation 'memref.copy' that was explicitly marked illegal}}
- memref.copy %arr, %arr2 : memref<8x8xi4> to !colMajor
- %ld = memref.load %arr2[%c0, %c0] : !colMajor
- return %ld : i4
+func.func @alloc_non_contiguous() {
+ // expected-error @+1 {{failed to legalize operation 'memref.alloc' that was explicitly marked illegal}}
+ %arr = memref.alloc() : memref<8x8xi4, strided<[1, 8]>>
+ return
+}
+
+// -----
+
+// expected-error @+1 {{failed to legalize operation 'func.func' that was explicitly marked illegal}}
+func.func @argument_non_contiguous(%arg0 : memref<8x8xi4, strided<[1, 8]>>) {
+ return
}
diff --git a/mlir/test/Dialect/MemRef/emulate-wide-int.mlir b/mlir/test/Dialect/MemRef/emulate-wide-int.mlir
index 65ac5be..994e400 100644
--- a/mlir/test/Dialect/MemRef/emulate-wide-int.mlir
+++ b/mlir/test/Dialect/MemRef/emulate-wide-int.mlir
@@ -1,4 +1,5 @@
-// RUN: mlir-opt --memref-emulate-wide-int="widest-int-supported=32" %s | FileCheck %s
+// RUN: mlir-opt --memref-emulate-wide-int="widest-int-supported=32" %s \
+// RUN: --split-input-file --verify-diagnostics | FileCheck %s
// Expect no conversions, i32 is supported.
// CHECK-LABEL: func @memref_i32
@@ -15,6 +16,8 @@ func.func @memref_i32() {
return
}
+// -----
+
// Expect no conversions, f64 is not an integer type.
// CHECK-LABEL: func @memref_f32
// CHECK: [[M:%.+]] = memref.alloc() : memref<4xf32, 1>
@@ -30,6 +33,8 @@ func.func @memref_f32() {
return
}
+// -----
+
// CHECK-LABEL: func @alloc_load_store_i64
// CHECK: [[C1:%.+]] = arith.constant dense<[1, 0]> : vector<2xi32>
// CHECK-NEXT: [[M:%.+]] = memref.alloc() : memref<4xvector<2xi32>, 1>
@@ -45,6 +50,7 @@ func.func @alloc_load_store_i64() {
return
}
+// -----
// CHECK-LABEL: func @alloc_load_store_i64_nontemporal
// CHECK: [[C1:%.+]] = arith.constant dense<[1, 0]> : vector<2xi32>
@@ -60,3 +66,30 @@ func.func @alloc_load_store_i64_nontemporal() {
memref.store %c1, %m[%c0] {nontemporal = true} : memref<4xi64, 1>
return
}
+
+// -----
+
+// Make sure we do not crash on unsupported types.
+func.func @alloc_i128() {
+ // expected-error@+1 {{failed to legalize operation 'memref.alloc' that was explicitly marked illegal}}
+ %m = memref.alloc() : memref<4xi128, 1>
+ return
+}
+
+// -----
+
+func.func @load_i128(%m: memref<4xi128, 1>) {
+ %c0 = arith.constant 0 : index
+ // expected-error@+1 {{failed to legalize operation 'memref.load' that was explicitly marked illegal}}
+ %v = memref.load %m[%c0] : memref<4xi128, 1>
+ return
+}
+
+// -----
+
+func.func @store_i128(%c1: i128, %m: memref<4xi128, 1>) {
+ %c0 = arith.constant 0 : index
+ // expected-error@+1 {{failed to legalize operation 'memref.store' that was explicitly marked illegal}}
+ memref.store %c1, %m[%c0] : memref<4xi128, 1>
+ return
+}
diff --git a/mlir/test/Target/LLVMIR/Import/intrinsic.ll b/mlir/test/Target/LLVMIR/Import/intrinsic.ll
index 28a1bd2..606b111 100644
--- a/mlir/test/Target/LLVMIR/Import/intrinsic.ll
+++ b/mlir/test/Target/LLVMIR/Import/intrinsic.ll
@@ -630,11 +630,21 @@ define void @va_intrinsics_test(ptr %0, ptr %1, ...) {
; CHECK-LABEL: @assume
; CHECK-SAME: %[[TRUE:[a-zA-Z0-9]+]]
define void @assume(i1 %true) {
- ; CHECK: "llvm.intr.assume"(%[[TRUE]]) : (i1) -> ()
+ ; CHECK: llvm.intr.assume %[[TRUE]] : i1
call void @llvm.assume(i1 %true)
ret void
}
+; CHECK-LABEL: @assume_with_opbundles
+; CHECK-SAME: %[[TRUE:[a-zA-Z0-9]+]]
+; CHECK-SAME: %[[PTR:[a-zA-Z0-9]+]]
+define void @assume_with_opbundles(i1 %true, ptr %p) {
+ ; CHECK: %[[ALIGN:.+]] = llvm.mlir.constant(8 : i32) : i32
+ ; CHECK: llvm.intr.assume %[[TRUE]] ["align"(%[[PTR]], %[[ALIGN]] : !llvm.ptr, i32)] : i1
+ call void @llvm.assume(i1 %true) ["align"(ptr %p, i32 8)]
+ ret void
+}
+
; CHECK-LABEL: @is_constant
; CHECK-SAME: %[[VAL:[a-zA-Z0-9]+]]
define void @is_constant(i32 %0) {
diff --git a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir
index 0634a7b..cb712eb 100644
--- a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir
@@ -363,6 +363,21 @@ llvm.func @umin_test(%arg0: i32, %arg1: i32, %arg2: vector<8xi32>, %arg3: vector
llvm.return
}
+// CHECK-LABEL: @assume_without_opbundles
+llvm.func @assume_without_opbundles(%cond: i1) {
+ // CHECK: call void @llvm.assume(i1 %{{.+}})
+ llvm.intr.assume %cond : i1
+ llvm.return
+}
+
+// CHECK-LABEL: @assume_with_opbundles
+llvm.func @assume_with_opbundles(%cond: i1, %p: !llvm.ptr) {
+ %0 = llvm.mlir.constant(8 : i32) : i32
+ // CHECK: call void @llvm.assume(i1 %{{.+}}) [ "align"(ptr %{{.+}}, i32 8) ]
+ llvm.intr.assume %cond ["align"(%p, %0 : !llvm.ptr, i32)] : i1
+ llvm.return
+}
+
// CHECK-LABEL: @vector_reductions
llvm.func @vector_reductions(%arg0: f32, %arg1: vector<8xf32>, %arg2: vector<8xi32>) {
// CHECK: call i32 @llvm.vector.reduce.add.v8i32
diff --git a/mlir/test/Target/LLVMIR/llvmir-invalid.mlir b/mlir/test/Target/LLVMIR/llvmir-invalid.mlir
index af09814..15658ea 100644
--- a/mlir/test/Target/LLVMIR/llvmir-invalid.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir-invalid.mlir
@@ -188,7 +188,7 @@ llvm.func @sadd_overflow_intr_wrong_type(%arg0 : i32, %arg1 : f32) -> !llvm.stru
llvm.func @assume_intr_wrong_type(%cond : i16) {
// expected-error @below{{op operand #0 must be 1-bit signless integer, but got 'i16'}}
- "llvm.intr.assume"(%cond) : (i16) -> ()
+ llvm.intr.assume %cond : i16
llvm.return
}
diff --git a/mlir/test/Target/LLVMIR/openmp-firstprivate.mlir b/mlir/test/Target/LLVMIR/openmp-firstprivate.mlir
index 02ce6b5..79412fb 100644
--- a/mlir/test/Target/LLVMIR/openmp-firstprivate.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-firstprivate.mlir
@@ -74,27 +74,38 @@ llvm.func @parallel_op_firstprivate_multi_block(%arg0: !llvm.ptr) {
// CHECK: [[PRIV_BB2]]:
// CHECK-NEXT: %[[C1:.*]] = phi i32 [ 1, %[[PRIV_BB1]] ]
// CHECK-NEXT: %[[PRIV_ALLOC:.*]] = alloca float, i32 %[[C1]], align 4
-// The entry block of the `copy` region is merged into the exit block of the
-// `alloc` region. So check for that.
+// CHECK-NEXT: br label %omp.region.cont
+
+// CHECK: omp.region.cont:
+// CHECK-NEXT: %[[PRIV_ALLOC2:.*]] = phi ptr [ %[[PRIV_ALLOC]], %[[PRIV_BB2]] ]
+// CHECK-NEXT: br label %omp.private.latealloc
+
+// CHECK: omp.private.latealloc:
+// CHECK-NEXT: br label %omp.private.copy
+
+// CHECK: omp.private.copy:
+// CHECK-NEXT: br label %omp.private.copy3
+
+// CHECK: omp.private.copy3:
// CHECK-NEXT: %[[ORIG_VAL:.*]] = load float, ptr %[[ORIG_PTR]], align 4
// CHECK-NEXT: br label %[[PRIV_BB3:.*]]
// Check contents of the 2nd block in the `copy` region.
// CHECK: [[PRIV_BB3]]:
-// CHECK-NEXT: %[[ORIG_VAL2:.*]] = phi float [ %[[ORIG_VAL]], %[[PRIV_BB2]] ]
-// CHECK-NEXT: %[[PRIV_ALLOC2:.*]] = phi ptr [ %[[PRIV_ALLOC]], %[[PRIV_BB2]] ]
-// CHECK-NEXT: store float %[[ORIG_VAL2]], ptr %[[PRIV_ALLOC2]], align 4
+// CHECK-NEXT: %[[ORIG_VAL2:.*]] = phi float [ %[[ORIG_VAL]], %omp.private.copy3 ]
+// CHECK-NEXT: %[[PRIV_ALLOC3:.*]] = phi ptr [ %[[PRIV_ALLOC2]], %omp.private.copy3 ]
+// CHECK-NEXT: store float %[[ORIG_VAL2]], ptr %[[PRIV_ALLOC3]], align 4
// CHECK-NEXT: br label %[[PRIV_CONT:.*]]
// Check that the privatizer's continuation block yields the private clone's
// address.
// CHECK: [[PRIV_CONT]]:
-// CHECK-NEXT: %[[PRIV_ALLOC3:.*]] = phi ptr [ %[[PRIV_ALLOC2]], %[[PRIV_BB3]] ]
+// CHECK-NEXT: %[[PRIV_ALLOC4:.*]] = phi ptr [ %[[PRIV_ALLOC3]], %[[PRIV_BB3]] ]
// CHECK-NEXT: br label %[[PAR_REG:.*]]
// Check that the body of the parallel region loads from the private clone.
// CHECK: [[PAR_REG]]:
-// CHECK: %{{.*}} = load float, ptr %[[PRIV_ALLOC3]], align 4
+// CHECK: %{{.*}} = load float, ptr %[[PRIV_ALLOC2]], align 4
omp.private {type = firstprivate} @multi_block.privatizer : !llvm.ptr alloc {
^bb0(%arg0: !llvm.ptr):
diff --git a/mlir/test/Target/LLVMIR/openmp-private.mlir b/mlir/test/Target/LLVMIR/openmp-private.mlir
index 6153e56..5407f97 100644
--- a/mlir/test/Target/LLVMIR/openmp-private.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-private.mlir
@@ -104,6 +104,9 @@ llvm.func @parallel_op_private_multi_block(%arg0: !llvm.ptr) {
// CHECK: omp.par.entry:
// CHECK: %[[ORIG_PTR_PTR:.*]] = getelementptr { ptr }, ptr %{{.*}}, i32 0, i32 0
// CHECK: %[[ORIG_PTR:.*]] = load ptr, ptr %[[ORIG_PTR_PTR]], align 8
+// CHECK: br label %omp.private.latealloc
+
+// CHECK: omp.private.latealloc:
// CHECK: br label %[[PRIV_BB1:.*]]
// Check contents of the first block in the `alloc` region.
@@ -151,8 +154,7 @@ omp.private {type = private} @multi_block.privatizer : !llvm.ptr alloc {
// CHECK: omp.par.region:
// CHECK: br label %[[PAR_REG_BEG:.*]]
// CHECK: [[PAR_REG_BEG]]:
-// CHECK: %[[PRIVATIZER_GEP:.*]] = getelementptr double, ptr @_QQfoo, i64 111
-// CHECK: call void @bar(ptr %[[PRIVATIZER_GEP]])
+// CHECK: call void @bar(ptr getelementptr (double, ptr @_QQfoo, i64 111))
// CHECK: call void @bar(ptr getelementptr (double, ptr @_QQfoo, i64 222))
llvm.func @lower_region_with_addressof() {
%0 = llvm.mlir.constant(1 : i64) : i64
diff --git a/mlir/test/mlir-tblgen/op-decl-and-defs.td b/mlir/test/mlir-tblgen/op-decl-and-defs.td
index 31dd537..a03d0b4 100644
--- a/mlir/test/mlir-tblgen/op-decl-and-defs.td
+++ b/mlir/test/mlir-tblgen/op-decl-and-defs.td
@@ -208,6 +208,11 @@ def NS_FOp : NS_Op<"op_with_all_types_constraint",
// CHECK-LABEL: class FOp :
// CHECK: static ::llvm::LogicalResult inferReturnTypes
+// DEFS: void FOp::build(::mlir::OpBuilder &odsBuilder, ::mlir::OperationState &odsState, ::mlir::Value a) {
+// DEFS: if (::mlir::succeeded(FOp::inferReturnTypes(odsBuilder.getContext(),
+// DEFS: else
+// DEFS: ::mlir::detail::reportFatalInferReturnTypesError(odsState);
+
def NS_GOp : NS_Op<"op_with_fixed_return_type", []> {
let arguments = (ins AnyType:$a);
let results = (outs I32:$b);
diff --git a/mlir/tools/mlir-tblgen/OmpOpGen.cpp b/mlir/tools/mlir-tblgen/OmpOpGen.cpp
index 8716667..04f81a4 100644
--- a/mlir/tools/mlir-tblgen/OmpOpGen.cpp
+++ b/mlir/tools/mlir-tblgen/OmpOpGen.cpp
@@ -106,9 +106,7 @@ static bool verifyArgument(const DagInit *arguments, StringRef argName,
const Init *argInit) {
auto range = zip_equal(arguments->getArgNames(), arguments->getArgs());
return llvm::any_of(
- range,
- [&](std::tuple<const llvm::StringInit *const &, const llvm::Init *const &>
- v) {
+ range, [&](std::tuple<const llvm::StringInit *, const llvm::Init *> v) {
return std::get<0>(v)->getAsUnquotedString() == argName &&
std::get<1>(v) == argInit;
});
diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
index ce2b6ed..71fa501 100644
--- a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
@@ -2503,7 +2503,8 @@ void OpEmitter::genSeparateArgParamBuilder() {
{1}.regions, inferredReturnTypes)))
{1}.addTypes(inferredReturnTypes);
else
- ::llvm::report_fatal_error("Failed to infer result type(s).");)",
+ ::mlir::detail::reportFatalInferReturnTypesError({1});
+ )",
opClass.getClassName(), builderOpState);
return;
}
diff --git a/polly/lib/CodeGen/BlockGenerators.cpp b/polly/lib/CodeGen/BlockGenerators.cpp
index c587636..b76d8f4 100644
--- a/polly/lib/CodeGen/BlockGenerators.cpp
+++ b/polly/lib/CodeGen/BlockGenerators.cpp
@@ -786,12 +786,6 @@ void BlockGenerator::generateScalarStores(
Builder.GetInsertBlock())) &&
"Domination violation");
- // The new Val might have a different type than the old Val due to
- // ScalarEvolution looking through bitcasts.
- Address = Builder.CreateBitOrPointerCast(
- Address, Val->getType()->getPointerTo(
- Address->getType()->getPointerAddressSpace()));
-
Builder.CreateStore(Val, Address);
});
}
diff --git a/polly/lib/CodeGen/IslNodeBuilder.cpp b/polly/lib/CodeGen/IslNodeBuilder.cpp
index 3f07f02..d76f625 100644
--- a/polly/lib/CodeGen/IslNodeBuilder.cpp
+++ b/polly/lib/CodeGen/IslNodeBuilder.cpp
@@ -1050,8 +1050,6 @@ Value *IslNodeBuilder::preloadUnconditionally(__isl_take isl_set *AccessRange,
auto *Ptr = AddressValue;
auto Name = Ptr->getName();
- auto AS = Ptr->getType()->getPointerAddressSpace();
- Ptr = Builder.CreatePointerCast(Ptr, Ty->getPointerTo(AS), Name + ".cast");
PreloadVal = Builder.CreateLoad(Ty, Ptr, Name + ".load");
if (LoadInst *PreloadInst = dyn_cast<LoadInst>(PreloadVal))
PreloadInst->setAlignment(cast<LoadInst>(AccInst)->getAlign());
diff --git a/polly/lib/CodeGen/LoopGeneratorsGOMP.cpp b/polly/lib/CodeGen/LoopGeneratorsGOMP.cpp
index cd440b2..b98416a 100644
--- a/polly/lib/CodeGen/LoopGeneratorsGOMP.cpp
+++ b/polly/lib/CodeGen/LoopGeneratorsGOMP.cpp
@@ -183,7 +183,7 @@ Value *ParallelLoopGeneratorGOMP::createCallGetWorkItem(Value *LBPtr,
// If F is not available, declare it.
if (!F) {
GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
- Type *Params[] = {LongType->getPointerTo(), LongType->getPointerTo()};
+ Type *Params[] = {Builder.getPtrTy(0), Builder.getPtrTy(0)};
FunctionType *Ty = FunctionType::get(Builder.getInt8Ty(), Params, false);
F = Function::Create(Ty, Linkage, Name, M);
}
diff --git a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel
index 3849341..7057f5d 100644
--- a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel
@@ -1221,12 +1221,31 @@ cc_library(
],
)
+gentbl_cc_library(
+ name = "DynamicLoaderMacOSXDYLDProperties",
+ strip_include_prefix = "DynamicLoader/MacOSX-DYLD",
+ tbl_outs = [
+ (
+ ["-gen-lldb-property-defs"],
+ "DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.inc",
+ ),
+ (
+ ["-gen-lldb-property-enum-defs"],
+ "DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinPropertiesEnum.inc",
+ ),
+ ],
+ tblgen = "//lldb:lldb-tblgen",
+ td_file = "DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.td",
+ deps = ["//lldb:CoreTdFiles"],
+)
+
cc_library(
name = "PluginDynamicLoaderMacOSXDYLD",
srcs = glob(["DynamicLoader/MacOSX-DYLD/*.cpp"]),
hdrs = glob(["DynamicLoader/MacOSX-DYLD/*.h"]),
include_prefix = "Plugins",
deps = [
+ ":DynamicLoaderMacOSXDYLDProperties",
":PluginObjCRuntime",
":PluginTypeSystemClang",
":PluginTypeSystemClangHeaders",
@@ -1239,6 +1258,7 @@ cc_library(
"//lldb:Target",
"//lldb:TargetHeaders",
"//lldb:Utility",
+ "//llvm:Support",
"//llvm:TargetParser",
],
)